import xmldataset
import pandas as pd
import re
import os
from datetime import datetime

# TODO: make this dynamic in the script that iterates over files in folders
# Path of the XML input file to convert. Its base name is also used to derive
# the channel and the name of the output CSV file.
input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
# Account identifier; written to the ACCOUNT column of the CSV and embedded in
# the output file name.
account = '1009'
#################


# profile for xmldataset library - should not be changed
# Passed to xmldataset.parse_using_profile() in write_file(). The indentation
# inside the string mirrors the nesting of the XML elements
# (TBCATALOG > PRODUCTDATA > PRODUCT > ...) and is part of the string itself,
# so do not re-indent it or add comments inside it.
# A_NR, A_ID, A_STOCK and A_VK are assigned to the 'articles' dataset, which is
# the dataset write_file() reads from the parse result.
# NOTE(review): P_CATEGORY appears to be copied onto each article through the
# external dataset / __EXTERNAL_VALUE__ lines, but write_file() does not include
# it in the CSV columns - confirm against the xmldataset documentation.
profile = """
TBCATALOG
    PRODUCTDATA
        PRODUCT
            P_CATEGORIES
                P_CATEGORY = external_dataset:product_information
            ARTICLEDATA
                ARTICLE
                    A_NR = dataset:articles
                    A_ID = dataset:articles
                    A_STOCK = dataset:articles
                    A_PRICEDATA
                        A_PRICE
                            A_VK = dataset:articles
                    __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
"""

def is_stealth_file(filename):
    """Return True if *filename* names a "stealth" XML file.

    Used to check if a file should be processed. A stealth file carries a
    ``_stealth`` marker somewhere before its ``.xml`` extension,
    e.g. ``zade_30_cat_20180305_224136_stealth.xml`` is a stealth file.

    Args:
        filename: File name (or path) to inspect.

    Returns:
        bool: True when the name contains ``_stealth`` (preceded by at least
        one character) followed, possibly after more characters, by ``.xml``;
        False otherwise.
    """
    # ``.*`` rather than ``.+`` between the marker and the extension: the
    # marker may sit directly in front of ".xml", as in the example above.
    # The raw string keeps ``\.`` from being an invalid string escape.
    return re.search(r'.+_stealth.*\.xml', filename) is not None
    

def get_channel_from_filename(filename):
    """Extract the channel from a file name.

    e.g. ``zade_30_cat_20180305_224136.xml`` has channel ``'30'``.

    Args:
        filename: Name containing a ``zade_<digits>_`` segment.

    Returns:
        str: The digits of the channel, as a string.

    Raises:
        ValueError: If *filename* contains no ``zade_<digits>_`` segment.
    """
    # Raw string so that ``\d`` is a regex class, not an invalid string escape.
    match = re.search(r'zade_(?P<channel>\d+)_.*', filename)
    if match is None:
        # Fail with a message naming the offending file instead of an
        # AttributeError raised on None.
        raise ValueError(
            'cannot extract channel from filename: {!r}'.format(filename))

    return match.group('channel')
    

def get_output_filename(account, original_filename, timestring=None):
    """Build the name of the output csv file.

    The result has the form ``<timestring>_<account>_<original filename>.csv``,
    e.g. ``2018-03-05_1009_<original filename>.csv``.

    Args:
        account: Account identifier placed after the date part.
        original_filename: Name of the input file the csv is derived from.
        timestring: Leading date part. When None, the current date formatted
            as ``YYYY-MM-DD`` is used.

    Returns:
        str: The generated csv file name.
    """
    # Only None triggers the default; any other value is used as given.
    date_part = (timestring if timestring is not None
                 else datetime.now().strftime('%Y-%m-%d'))

    template = '{timestring}_{account}_{original_filename}.csv'
    fields = {
        'timestring': date_part,
        'account': account,
        'original_filename': original_filename,
    }
    return template.format(**fields)
    

def write_file(input_file, output_file, account):
    """Convert the articles of one XML input file into a csv file.

    The XML is parsed with the module-level xmldataset ``profile``; the
    resulting 'articles' records are loaded into a DataFrame, tagged with the
    channel (taken from the input file name) and the account, and written to
    *output_file* without the index column.

    Args:
        input_file: Path of the XML file to read. Its base name must contain
            the channel (see get_channel_from_filename).
        output_file: Path of the csv file to write.
        account: Account identifier stored in the ACCOUNT column.
    """
    # Read inside a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(input_file) as xml_file:
        xml = xml_file.read()
    result = xmldataset.parse_using_profile(xml, profile)

    filename = os.path.basename(input_file)
    channel = get_channel_from_filename(filename)

    df = pd.DataFrame.from_records(result['articles'])
    df['CHANNEL'] = channel
    df['ACCOUNT'] = account

    # Order the data so it's the same across all files
    df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]

    df.to_csv(output_file, index=False)


# TODO: turn this into a script that takes 2 command-line arguments:
# 1. starting directory - the root under which all the subdirectories of the
#    form /1009_outfit/tbcat/out_save/2018/03/05 are located
# 2. output directory - where all the output files are saved
# NOTE: keep track of what the current account is (e.g. 1009) so it can be
# set dynamically in the filename and output file

# For every input file found, run the steps below.
# input_file may be a unix path with directories included,
# e.g.  backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml
original_filename = os.path.split(input_file)[1]
output_file = get_output_filename(account=account,
                                  original_filename=original_filename)
write_file(input_file=input_file, output_file=output_file, account=account)