import xmldataset
import pandas as pd
import re
import os
from datetime import datetime

# TODO: make this dynamic in the script that iterates over files in folders
input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
account = '1009'

#################
# profile for the xmldataset library - should not be changed
# (the indentation expresses the XML nesting that xmldataset expects)
profile = """
TBCATALOG
    PRODUCTDATA
        PRODUCT
            P_CATEGORIES
                P_CATEGORY = external_dataset:product_information
            ARTICLEDATA
                ARTICLE
                    A_NR = dataset:articles
                    A_ID = dataset:articles
                    A_STOCK = dataset:articles
                    A_PRICEDATA
                        A_PRICE
                            A_VK = dataset:articles
                    __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
"""

def is_stealth_file(filename):
    # used to check if a file should be processed
    # e.g. zade_30_cat_20180305_224136_stealth.xml is a stealth file
    res = re.search(r'.+_stealth.*\.xml', filename)
    if res is None:
        return False
    return True


def get_channel_from_filename(filename):
    # extract the channel from the filename
    # e.g. zade_30_cat_20180305_224136.xml has channel 30
    return re.search(r'zade_(?P<channel>\d+)_.*', filename).group('channel')

def get_output_filename(account, original_filename, timestring=None):
    # generates the filename of the output csv file
    # note that if timestring is not specified, the current date is used
    # e.g. <DATE 2018-03-05>_<ACCOUNT 1009>_<original filename>.csv
    if timestring is None:
        timestring = datetime.now().strftime('%Y-%m-%d')
    return '{timestring}_{account}_{original_filename}.csv'.format(
        timestring=timestring,
        account=account,
        original_filename=original_filename)

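# Example (hypothetical call; an explicit timestring keeps the result deterministic):
#   get_output_filename('1009', 'zade_30_cat_20180305_224136.xml', timestring='2018-03-05')
#   -> '2018-03-05_1009_zade_30_cat_20180305_224136.xml.csv'
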
def write_file(input_file, output_file, account):
    # parse the catalogue XML and write the article data to a CSV file
    with open(input_file) as f:
        xml = f.read()
    result = xmldataset.parse_using_profile(xml, profile)
    filename = os.path.basename(input_file)
    channel = get_channel_from_filename(filename)
    df = pd.DataFrame.from_records(result['articles'])
    df['CHANNEL'] = channel
    df['ACCOUNT'] = account
    # Order the columns so they are the same across all files
    df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]
    df.to_csv(output_file, index=False)

# TODO: turn this into a script that takes 2 command-line arguments
# (a rough sketch is at the bottom of this file):
# 1. starting directory - used as the root under which all the subdirectories of the format
#    /1009_outfit/tbcat/out_save/2018/03/05 are located
# 2. output directory - where all the output files are saved
# NOTE: keep track of what the current account is (e.g. 1009) so it can be set
# dynamically in the output filename and output file
# for every input file found, run this:
original_filename = os.path.basename(input_file)
output_file = get_output_filename(account, original_filename)
write_file(input_file, output_file, account)
# supports a unix path with directories included
# e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml
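
# A rough sketch of the TODO above, assuming the directory layout described in
# the comments (<root>/<account>_<name>/tbcat/out_save/<YYYY>/<MM>/<DD>/*.xml,
# unix-style paths). The argument names, get_account_from_path() and its regex
# are illustrative assumptions, not part of the original script.
import argparse


def get_account_from_path(path):
    # e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/file.xml -> '1009'
    match = re.search(r'(?P<account>\d+)_[^/]+/tbcat', path)
    return match.group('account') if match else None


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('start_dir', help='root directory containing the <account>_*/tbcat/... trees')
    parser.add_argument('output_dir', help='directory where the CSV files are written')
    args = parser.parse_args()

    for dirpath, _dirnames, filenames in os.walk(args.start_dir):
        for filename in filenames:
            # only process regular catalogue XML files, skip stealth files
            if not filename.endswith('.xml') or is_stealth_file(filename):
                continue
            input_path = os.path.join(dirpath, filename)
            file_account = get_account_from_path(input_path)
            if file_account is None:
                continue
            output_file = os.path.join(
                args.output_dir, get_output_filename(file_account, filename))
            write_file(input_path, output_file, file_account)


# once the hard-coded single-file run above is removed, the entry point would be:
# if __name__ == '__main__':
#     main()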