Created
March 6, 2018 15:57
-
-
Save gcetusic/a62ef19f1aa88f5558b3adf321fa43fd to your computer and use it in GitHub Desktop.
Revisions
-
gcetusic created this gist
Mar 6, 2018 .There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,89 @@ import xmldataset import pandas as pd import re import os from datetime import datetime # TODO: make this dynamic in the script that iterates over files in folders input_file = 'Downloads/zade_30_cat_20180306_000530.xml' account = '1009' ################# # profile for xmldataset library - should not be changed profile = """ TBCATALOG PRODUCTDATA PRODUCT P_CATEGORIES P_CATEGORY = external_dataset:product_information ARTICLEDATA ARTICLE A_NR = dataset:articles A_ID = dataset:articles A_STOCK = dataset:articles A_PRICEDATA A_PRICE A_VK = dataset:articles __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles """ def is_stealth_file(filename): # used to check if file should be processed # e.g. zade_30_cat_20180305_224136_stealth.xml is a stealth file res = re.search('.+_stealth.+\.xml', filename) if res is None: return False return True def get_channel_from_filename(filename): # extract the channel from the filename # e.g. zade_30_cat_20180305_224136.xml has channel 30 return re.search('zade_(?P<channel>\d+)_.*', filename).group('channel') def get_output_filename(account, original_filename, timestring=None): # generates the filename of the output csv file # note that if timestring is not specified, the current time is used # e.g. <DATE 2018-03-05>_<ACCOUNT 1009>_<original filename>.csv if timestring is None: timestring = datetime.now().strftime('%Y-%m-%d') return '{timestring}_{account}_{original_filename}.csv'.format( timestring=timestring, account=account, original_filename=original_filename) def write_file(input_file, output_file, account): xml = open(input_file).read() result = xmldataset.parse_using_profile(xml, profile) filename = os.path.basename(input_file) channel = get_channel_from_filename(filename) df = pd.DataFrame.from_records(result['articles']) df['CHANNEL'] = channel df['ACCOUNT'] = account # Order the data so it's the same accross all files df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']] df.to_csv(output_file, index=False) # TODO: turn this into a script that takes 2 commandline arguments; # 1. starting directory - used as root from where all the subdirectories of format # /1009_outfit/tbcat/out_save/2018/03/05 are situated # 2. output directory - where all the output files are saved # NOTE: keep track on what the current account is (e.g. 1009) so it can be dynamically set in filename and output file # for every input file found, run this: original_filename = os.path.basename(input_file) output_file = get_output_filename(account, original_filename) write_file(input_file, output_file, account) # supports a unix path with directories included # e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml