#!/usr/bin/env python3
import os
import re
import sys
import tempfile
import urllib.request
from xml.etree import ElementTree as ET
from html.parser import HTMLParser
from datetime import datetime, timedelta, timezone
import time
import argparse
import shutil

# Pin PATH to a fixed list of system directories. The external commands this
# script launches through os.popen()/os.system() inherit this environment, so
# they are looked up only in these directories regardless of the caller's PATH.
os.environ["PATH"] = "/usr/local/sbin:/bin:/sbin:/usr/bin:/usr/sbin"


def parse_args():
    """Parse the process command line and return the resulting namespace.

    Two boolean switches are recognised: ``--force`` and ``--debug``.
    """
    cli = argparse.ArgumentParser(description="Sync ADM Tabacchi data.")
    switches = (
        ("--force", "Force synchronization regardless of last sync time."),
        ("--debug", "Enable debug mode."),
    )
    for flag, description in switches:
        cli.add_argument(flag, action="store_true", help=description)
    return cli.parse_args()

# Command-line options, parsed once when the module is loaded. Functions in
# this file (e.g. deb()) read this module-level namespace directly.
args = parse_args()

def deb(*msg):
    """Write a debug message to stderr; a no-op unless --debug was given.

    The positional arguments are forwarded to print() unchanged.
    """
    if not args.debug:
        return
    print(*msg, file=sys.stderr)

class DictTreeHTMLParser(HTMLParser):
    """Build a nested dict tree from HTML fed to the parser.

    Every element becomes ``{'tag': str, 'attrs': dict, 'children': list}``;
    children are either element dicts or non-empty, stripped text strings.
    The synthetic ``root`` node (``tag`` is None) holds the top-level nodes.

    Void elements (``<br>``, ``<img>``, ...) never receive an end tag, so
    they are recorded as leaves instead of being opened; otherwise every
    following node would end up nested inside them. End tags are matched by
    name against the currently open elements: a stray end tag is ignored and
    an end tag for an outer element also closes the unclosed inner ones.
    Implicit closing rules of HTML (e.g. ``<li>`` closing a previous ``<li>``)
    are not modelled.
    """

    # Elements that have no content and no closing tag in HTML.
    VOID_ELEMENTS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'param', 'source', 'track', 'wbr',
    ))

    def __init__(self):
        super().__init__()
        # Ancestors of ``current``, outermost (root) first.
        self.stack = []
        self.root = {'tag': None, 'attrs': {}, 'children': []}
        # Element that receives the next child node.
        self.current = self.root

    def handle_starttag(self, tag, attrs):
        """Append a new element to the current node and, unless void, open it."""
        element = {
            'tag': tag,
            'attrs': dict(attrs),
            'children': []
        }
        self.current['children'].append(element)
        if tag in self.VOID_ELEMENTS:
            # Leaf node: nothing can be nested inside it.
            return
        self.stack.append(self.current)
        self.current = element

    def handle_endtag(self, tag):
        """Close the nearest open element named *tag*, if there is one."""
        if tag in self.VOID_ELEMENTS or not self.stack:
            # Void elements were never opened; with an empty stack only the
            # root is open and it cannot be closed.
            return
        # Open elements from outermost to innermost; the parent of
        # open_elements[i] is self.stack[i].
        open_elements = self.stack[1:] + [self.current]
        for i in range(len(open_elements) - 1, -1, -1):
            if open_elements[i]['tag'] == tag:
                self.current = self.stack[i]
                del self.stack[i:]
                return
        # No open element has this name: stray end tag, ignore it.

    def handle_data(self, data):
        """Attach non-blank text, stripped of surrounding whitespace."""
        text = data.strip()
        if text:
            self.current['children'].append(text)

def parse_html_to_dict_tree(html):
    """Parse an HTML string and return the root node of its dict tree.

    The returned root is the synthetic node created by DictTreeHTMLParser
    (``tag`` is None); the document's top-level nodes are in its
    ``'children'`` list.
    """
    parser = DictTreeHTMLParser()
    parser.feed(html)
    # HTMLParser may keep trailing text buffered until close() is called;
    # without it, text after the last tag would never reach handle_data().
    parser.close()
    return parser.root

def find_subtree(tree, tag, class_name=None):
    """
    Search tree (depth-first, document order) for all elements with given tag.

    If class_name is given, match only elements whose class attribute
    contains class_name as one of its whitespace-separated tokens.

    Returns:
        None when tree is None; otherwise a list of the matching element
        dicts, or [None] when nothing matches, so that indexing the result
        with [0] always yields either an element or None.
    """
    if tree is None:
        return None
    matches = []

    # Explicit stack instead of recursion: deeply nested trees must not hit
    # the interpreter's recursion limit.
    pending = [tree]
    while pending:
        node = pending.pop()
        if not isinstance(node, dict):
            # Text children are plain strings; nothing to match or descend into.
            continue
        if node.get('tag') == tag:
            if class_name is None:
                matches.append(node)
            else:
                # A valueless attribute (<div class>) is stored as None.
                cls = (node.get('attrs') or {}).get('class') or ''
                # split() without argument also handles tabs and newlines.
                if class_name in cls.split():
                    matches.append(node)
        # Push children reversed so they are popped in document order.
        pending.extend(reversed(node.get('children', [])))

    return matches if matches else [None]

def find_text(tree, tag, class_name=None):
    """
    Return the direct text of the first element matching tag and class_name.

    An empty string is returned when tree is None or when no element matches.
    """
    if tree is None:
        return ""
    first_match = find_subtree(tree, tag, class_name)[0]
    return get_text(first_match)

def get_text(tree):
    """Return the element's direct text children joined by single spaces.

    Only string children of the node itself are used; text inside nested
    elements is not included. Anything that is not a dict yields "".
    """
    if not isinstance(tree, dict):
        return ""
    pieces = (item for item in tree['children'] if isinstance(item, str))
    joined = ' '.join(pieces)
    return joined.strip()







def shell_escape(string):
    """Escape *string* for use INSIDE a single-quoted shell word.

    The caller supplies the surrounding single quotes, e.g.
    ``"cmd '{}'".format(shell_escape(value))``.

    Inside single quotes the shell treats every character literally, so the
    only character needing treatment is the single quote itself: it is
    replaced by ``'"'"'`` (close the quote, emit a double-quoted ``'``,
    reopen the quote). Backslashes must NOT be doubled, because they are
    literal inside single quotes and doubling would alter the value.
    """
    return string.replace("'", "'\"'\"'")

def shell(command):
    """Run *command* through os.popen() and return everything it wrote to stdout."""
    with os.popen(command) as pipe:
        output = pipe.read()
    return output

def download_url(url, timeout=60):
    """Open *url* and return the response object (caller reads/closes it).

    Args:
        url: Address to fetch.
        timeout: Seconds to wait on each blocking socket operation, so a
            stalled server cannot hang the script forever.

    On a urllib URLError (which includes HTTP errors) the error is printed
    to stderr and the process exits with status 1. A timeout that occurs
    later, while the caller reads the body, is not handled here.
    """
    from urllib.error import URLError

    deb("Downloading:", url)
    try:
        # Open the URL exactly once: a second urlopen() would download the
        # resource twice and leave the first response unclosed.
        return urllib.request.urlopen(url, timeout=timeout)
    except URLError as e:
        print("Error downloading {}: {}".format(url, e), file=sys.stderr)
        sys.exit(1)

def download_all():
    """Download the ADM price-list page and every PDF price list it links to.

    The page is searched for the first ``<ul>`` inside the first
    ``<div class="asset-content">``; each ``<li>`` whose first ``<a>`` points
    to a URL containing ``.pdf`` is passed to download_listino(), using the
    anchor text as the label.

    Returns:
        ``{"result": "OK", "articles": [...]}`` on success, otherwise a dict
        with ``"result"`` different from ``"OK"`` and a ``"message"``. A page
        whose layout is not recognised, or that yields no article at all, is
        reported as an error so that callers never replace stored data with
        an empty list.
    """
    def first_match(node, tag, class_name=None):
        # find_subtree() yields None for a None tree and [None] for "no
        # match"; normalise both to a plain None.
        if node is None:
            return None
        found = find_subtree(node, tag, class_name)
        return found[0] if found else None

    host = "https://www.adm.gov.it"
    page = download_url(host + "/portale/monopoli/tabacchi/prezzi/prezzi_pubblico").read().decode('utf-8')
    if not page:
        return {"result": "ERROR", "message": "Failed to download the main page."}

    tree = parse_html_to_dict_tree(page)
    content = first_match(tree, 'div', 'asset-content')
    ul = first_match(content, 'ul')
    if ul is None:
        return {"result": "ERROR", "message": "Price list links not found on the main page."}

    items = [li for li in find_subtree(ul, 'li') if li is not None]
    deb("List found:", items)

    list_data = []
    pdf_count = 0
    for item in items:
        anchor = first_match(item, 'a')
        if anchor is None:
            continue
        href = anchor['attrs'].get('href')
        if not href:
            # Anchor without a target (e.g. a named anchor): nothing to fetch.
            continue
        # Site-relative links need the host; absolute ones are used as they are.
        pdfurl = href if re.match(r"https?://", href) else host + href
        if not re.search(r"\.pdf", pdfurl):
            continue
        pdf_count += 1
        res = download_listino(pdfurl, get_text(anchor))
        if res['result'] != "OK":
            return res
        list_data.extend(res['articles'])

    if pdf_count == 0:
        return {"result": "ERROR", "message": "No PDF price lists found on the main page."}
    if not list_data:
        return {"result": "ERROR", "message": "No articles could be extracted from the price lists."}
    return {"result": "OK", "articles": list_data}

def download_listino(pdfurl, lbl):
    """Download one PDF price list and extract its article rows.

    The PDF is saved in a fresh temporary directory, converted with
    ``pdftohtml -xml -nodrm`` and the resulting XML is read with parse_xml().
    Only lines made of exactly five text cells are kept.

    Args:
        pdfurl: URL of the PDF.
        lbl: Label stored with every article of this list (third field).

    Returns:
        ``{"result": "OK", "articles": [...]}`` where each article is
        ``[cell 0, cell 1, lbl, cell 2, cell 4 with ',' replaced by '.']``
        (all stripped), or ``{"result": "ERROR", "message": ...}`` when the
        conversion or the XML parsing fails.

    The temporary directory is removed afterwards unless --debug is active.
    """
    import subprocess

    tmpdir = tempfile.mkdtemp()
    tmpfile = os.path.join(tmpdir, "pdf.pdf")
    xmlfile = tmpfile + ".xml"
    try:
        # Close the HTTP response as soon as the body has been stored.
        with download_url(pdfurl) as response, open(tmpfile, 'wb') as f:
            f.write(response.read())

        # Argument list instead of a shell string: no quoting issues.
        cmd = ["pdftohtml", "-xml", "-nodrm", tmpfile, xmlfile]
        deb("Executing:", " ".join(cmd))
        try:
            status = subprocess.run(cmd).returncode
        except OSError as e:
            return {"result": "ERROR", "message": "Cannot run pdftohtml: {}".format(e)}
        if status != 0 or not os.path.exists(xmlfile):
            return {
                "result": "ERROR",
                "message": "pdftohtml failed (exit status {}) for {}".format(status, pdfurl),
            }

        try:
            tab = parse_xml(xmlfile)
        except ET.ParseError as e:
            return {"result": "ERROR", "message": "Invalid XML produced for {}: {}".format(pdfurl, e)}

        articles = []
        # Lines in top-to-bottom order, cells of each line left to right.
        for ln in sorted(tab):
            cells = tab[ln]
            vals = [cells[col] for col in sorted(cells)]
            if len(vals) == 5:
                articles.append([
                    vals[0].strip(),
                    vals[1].strip(),
                    lbl.strip(),
                    vals[2].strip(),
                    vals[4].replace(",", ".").strip(),
                ])
        return {"result": "OK", "articles": articles}
    finally:
        # In debug mode the intermediate files are kept for inspection.
        if os.path.exists(tmpdir) and not args.debug:
            shutil.rmtree(tmpdir)

def parse_xml(xmlfile):
    """Group the <text> elements of an XML file into lines and columns.

    Returns a dict ``{line_key: {left: text}}``. The line key is
    ``10000 * page number + top`` of the element, snapped to the previously
    registered line by find_and_regist_line() when the two are close enough.
    Elements with empty text or a ``left`` value that is not positive are
    skipped.
    """
    rows = {}
    known_lines = []
    current_page = 0
    for node in ET.parse(xmlfile).getroot().iter():
        if node.tag == "page":
            current_page = int(node.attrib.get("number", 0))
            continue
        if node.tag != "text":
            continue
        raw_line = 10000 * current_page + int(node.attrib.get("top", 0))
        left = int(node.attrib.get("left", 0))
        if not node.text or left <= 0:
            continue
        line_key = find_and_regist_line(known_lines, raw_line)
        rows.setdefault(line_key, {})[left] = node.text
    return rows

def find_and_regist_line(index, ln):
    """Return the line key to use for position *ln*, registering new lines.

    If *ln* lies less than 5 units from the most recently registered line
    (the last item of *index*), that line's key is returned. Otherwise *ln*
    is appended to *index* and returned as a new line key.
    """
    tolerance = 5
    if not index or abs(index[-1] - ln) >= tolerance:
        index.append(ln)
        return ln
    return index[-1]


def save_to_db(articles):
    """Replace the contents of table adm_tabacchi with *articles*.

    Args:
        articles: Iterable of 5-item sequences of strings
            ``(extcode, descr, opt_info, pkg_info, price)``.

    Rows whose article code was already seen, or whose price is not a plain
    decimal number after normalisation, are reported on stderr and skipped.
    The generated SQL is also written to /tmp/tabacchi.log.

    The process exits with status 1 when there is nothing valid to store
    (the table is then left untouched), when the SQL cannot be sent, or when
    the mysql client reports a failure.
    """
    def sql_escape(s):
        """
        Escape backslashes and single quotes for a quoted SQL string literal.
        """
        # Backslashes first, so the ones added for quotes are not doubled.
        return s.replace("\\", "\\\\").replace("'", "\\'")

    rows = []
    seen_codes = set()
    for article in articles:
        code = article[0]
        # price can have a dot between thousand and hundred, and second dot as decimal separator
        # ('19964', 'BOLIVAR BELICOSOS FINOS RESERVA COSECHA 2016', 'Sigari - pdf', 'da 20 pezzi', 2.100.00),
        price = re.sub(r'\.(\d{3})', r'\1', article[4].strip()).replace(',', '.')
        # The price is inserted unquoted, so it must be strictly numeric.
        if not re.fullmatch(r'\d+(\.\d+)?', price):
            print("Invalid price for article code {}: {!r}".format(code, article[4]), file=sys.stderr)
            continue
        if code in seen_codes:
            print("Duplicate article code:", code, file=sys.stderr)
            continue
        seen_codes.add(code)
        rows.append("('{}', '{}', '{}', '{}', {})".format(
            sql_escape(code),
            sql_escape(article[1]),
            sql_escape(article[2]),
            sql_escape(article[3]),
            price,
        ))

    if not rows:
        # Sending DELETE followed by an INSERT without values would only
        # empty the table.
        print("Error: no valid articles to store; adm_tabacchi left unchanged.", file=sys.stderr)
        sys.exit(1)

    statement = """
            DELETE FROM adm_tabacchi;
            INSERT INTO adm_tabacchi (extcode, descr, opt_info, pkg_info, price) VALUES
            """ + ",\n".join(rows) + ";"

    try:
        # Write the log first: if that fails, mysql is never started.
        with open("/tmp/tabacchi.log", "w") as log:
            log.write(statement + "\n")
        # NOTE(review): credentials are hard-coded and visible on the command
        # line; DELETE and INSERT are not wrapped in a transaction.
        mysql = os.popen("mysql -u euro3g -peuro3g eurotest", "w")
        try:
            mysql.write(statement)
        finally:
            # close() returns None on success, otherwise the exit status.
            status = mysql.close()
        if status is not None:
            print("Error: mysql exited with status", status, file=sys.stderr)
            sys.exit(1)
    except Exception as e:
        print("Error: ", e, file=sys.stderr)
        sys.exit(1)

def get_config(key):
    """Return the stripped output of ``systools/get_eurodb_config_val`` for *key*.

    The command path is relative, so the result depends on the current
    working directory. The key is passed as a single-quoted shell word:
    shell_escape() only protects text placed inside single quotes.
    """
    return shell("systools/get_eurodb_config_val '{}'".format(shell_escape(key))).strip()

def has_support():
    """Tell whether the configured maintenance date allows the sync to run.

    The ``MaintenanceExpireDate`` config value (``YYYY-MM-DD``, taken as
    midnight UTC) is compared with the current time; the result is True
    while the whole-day difference (floored) is at least -45, i.e. up to
    roughly 45 days after the expiry date. A value that cannot be parsed
    gives False.
    """
    now_epoch = int(datetime.now().timestamp())
    expiry_text = get_config("MaintenanceExpireDate")
    try:
        expiry = datetime.strptime(expiry_text, "%Y-%m-%d")
    except ValueError:
        deb("Invalid MaintenanceExpireDate format:", expiry_text)
        return False
    expiry_epoch = int(expiry.replace(tzinfo=timezone.utc).timestamp())
    days_left = (expiry_epoch - now_epoch) // 86400
    return days_left >= -45

if __name__ == "__main__":
    # The systools/ helper commands are invoked with relative paths.
    os.chdir("/opt/euro-beta")

    if not has_support():
        deb("No support available. Exiting.")
        exit(0)

    if get_config("SpinKeyClient") == "1":
        deb("This is a client system. Exiting.")
        exit(0)

    # Skip the run when the previous sync is less than an hour old, unless
    # --force was given. A stored value that is not a valid timestamp simply
    # disables this check.
    stored_sync = get_config("AdmTabacchiLastSync")
    try:
        elapsed = datetime.now() - datetime.fromtimestamp(int(stored_sync))
        synced_recently = elapsed < timedelta(hours=1)
        if synced_recently and not args.force:
            exit(0)
    except ValueError:
        pass

    # Taken before the download starts, so the recorded sync time is the
    # start of this run.
    started_at = int(datetime.now().timestamp())
    outcome = download_all()
    if outcome['result'] != "OK":
        print("Error:", outcome.get('message', 'Unknown error'), file=sys.stderr)
        print("Download failed.", file=sys.stderr)
        exit(1)

    save_to_db(outcome['articles'])
    shell("systools/set_eurodb_config_val AdmTabacchiLastSync string '{}'".format(started_at))




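# Example (reference only, never executed): the list that
# find_subtree(tree, 'div', 'asset-content') appears to return for the ADM
# price-list page, i.e. the structure download_all() walks to find the PDF links.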
"""
[
    {
        'tag': 'div',
        'attrs': {'class': 'asset-content mb-3'},
        'children': [
            {
                'tag': 'div',
                'attrs': {
                    'class': 'journal-content-article ',
                    'data-analytics-asset-id': '1957138',
                    'data-analytics-asset-title': 'elenco prezzi pubblico',
                    'data-analytics-asset-type': 'web-content'
                },
                'children': [
                    {
                        'tag': 'div',
                        'attrs': {'class': 'py-5'},
                        'children': [
                            {
                                'tag': 'div',
                                'attrs': {'class': 'container'},
                                'children': [
                                    {
                                        'tag': 'ul',
                                        'attrs': {'class': 'list-link pl-0 pb-4'},
                                        'children': [
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Sigarette - pdf',
                                                                    'href': '/portale/documents/20182/1106899/1-AGG+LIST+SIGARETTE.pdf/9632b2a3-ccd8-1592-d6a2-9a020686a06f?t=1753352260432'
                                                                },
                                                                'children': ['Sigarette - pdf']
                                                            },
                                                            '- aggiornato il 24/07/2025'
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Sigari - pdf',
                                                                    'href': '/portale/documents/20182/1106899/2-LIST+SIGARI.pdf/386887ba-9526-75b2-c461-f7b04bb2cdea?t=1754458582711'
                                                                },
                                                                'children': ['Sigari - pdf']
                                                            },
                                                            '- aggiornato il 06/08/2025'
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Sigaretti',
                                                                    'href': '/portale/documents/20182/1106899/3-AGG+LIST+SIGARETTI.pdf/a854186e-2286-1f16-723f-99ed37f5acbe?t=1754458629247'
                                                                },
                                                                'children': ['Sigaretti']
                                                            },
                                                            '- aggiornato il 06/08/2025'
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Fiuto e mastico',
                                                                    'href': '/portale/documents/20182/1106899/6-LIST+FIUTO27032025.pdf/39984a1d-f097-a966-e242-d52cb9fdd28e?t=1743059956188'
                                                                },
                                                                'children': ['Fiuto e mastico']
                                                            },
                                                            '- aggiornato il 27/03/2025'
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Trinciati per sigaretta',
                                                                    'href': '/portale/documents/20182/1106899/3-AGG+LIST+TRINCIATO.pdf/3df140a5-7d0b-adb8-d136-3997f8e49aa3?t=1751626735827'
                                                                },
                                                                'children': ['Trinciati per sigaretta']
                                                            },
                                                            '- aggiornato il 04/07/2025'
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Altri tabacchi da fumo - pdf',
                                                                    'href': '/portale/documents/20182/1106899/2-LIST+ALTRI.pdf/75a11193-eaaa-bd6e-f754-1def21fc17b2?t=1752071463573'
                                                                },
                                                                'children': ['Altri tabacchi da fumo - pdf']
                                                            },
                                                            '- aggiornato il 09/07/2025'
                                                        ]
                                                    }
                                                ]
                                            },
                                            {
                                                'tag': 'li',
                                                'attrs': {'class': 'd-flex text-adm mb-4'},
                                                'children': [
                                                    {
                                                        'tag': 'span',
                                                        'attrs': {'class': 'col-auto pl-2 fas fa-paperclip text-24'},
                                                        'children': []
                                                    },
                                                    {
                                                        'tag': 'div',
                                                        'attrs': {},
                                                        'children': [
                                                            {
                                                                'tag': 'a',
                                                                'attrs': {
                                                                    'aria-label': 'Scarica il documento Prodotti da inalazione senza combustione',
                                                                    'href': '/portale/documents/20182/1106899/1-AGG+LIST+HTP.pdf/220da906-73fb-c4e4-3e99-6b300ab075a9?t=1754470006777'
                                                                },
                                                                'children': ['Prodotti da inalazione senza combustione']
                                                            },
                                                            '– aggiornato il 06/08/2025'
                                                        ]
                                                    }
                                                ]
                                            }
                                        ]
                                    }
                                ]
                            }
                        ]
                    }
                ]
            }
        ]
    }
]
"""


