import xmldataset import pandas as pd import re import os from datetime import datetime # TODO: make this dynamic in the script that iterates over files in folders input_file = 'Downloads/zade_30_cat_20180306_000530.xml' account = '1009' ################# # profile for xmldataset library - should not be changed profile = """ TBCATALOG PRODUCTDATA PRODUCT P_CATEGORIES P_CATEGORY = external_dataset:product_information ARTICLEDATA ARTICLE A_NR = dataset:articles A_ID = dataset:articles A_STOCK = dataset:articles A_PRICEDATA A_PRICE A_VK = dataset:articles __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles """ def is_stealth_file(filename): # used to check if file should be processed # e.g. zade_30_cat_20180305_224136_stealth.xml is a stealth file res = re.search('.+_stealth.+\.xml', filename) if res is None: return False return True def get_channel_from_filename(filename): # extract the channel from the filename # e.g. zade_30_cat_20180305_224136.xml has channel 30 return re.search('zade_(?P\d+)_.*', filename).group('channel') def get_output_filename(account, original_filename, timestring=None): # generates the filename of the output csv file # note that if timestring is not specified, the current time is used # e.g. __.csv if timestring is None: timestring = datetime.now().strftime('%Y-%m-%d') return '{timestring}_{account}_{original_filename}.csv'.format( timestring=timestring, account=account, original_filename=original_filename) def write_file(input_file, output_file, account): xml = open(input_file).read() result = xmldataset.parse_using_profile(xml, profile) filename = os.path.basename(input_file) channel = get_channel_from_filename(filename) df = pd.DataFrame.from_records(result['articles']) df['CHANNEL'] = channel df['ACCOUNT'] = account # Order the data so it's the same accross all files df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']] df.to_csv(output_file, index=False) # TODO: turn this into a script that takes 2 commandline arguments; # 1. starting directory - used as root from where all the subdirectories of format # /1009_outfit/tbcat/out_save/2018/03/05 are situated # 2. output directory - where all the output files are saved # NOTE: keep track on what the current account is (e.g. 1009) so it can be dynamically set in filename and output file # for every input file found, run this: original_filename = os.path.basename(input_file) output_file = get_output_filename(account, original_filename) write_file(input_file, output_file, account) # supports a unix path with directories included # e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml