Skip to content

Instantly share code, notes, and snippets.

@gcetusic
Created March 6, 2018 15:57
Show Gist options
  • Save gcetusic/a62ef19f1aa88f5558b3adf321fa43fd to your computer and use it in GitHub Desktop.
Save gcetusic/a62ef19f1aa88f5558b3adf321fa43fd to your computer and use it in GitHub Desktop.
import xmldataset
import pandas as pd
import re
import os
from datetime import datetime
# TODO: make this dynamic in the script that iterates over files in folders
input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
account = '1009'
#################
# profile for xmldataset library - should not be changed
profile = """
TBCATALOG
PRODUCTDATA
PRODUCT
P_CATEGORIES
P_CATEGORY = external_dataset:product_information
ARTICLEDATA
ARTICLE
A_NR = dataset:articles
A_ID = dataset:articles
A_STOCK = dataset:articles
A_PRICEDATA
A_PRICE
A_VK = dataset:articles
__EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
"""
def is_stealth_file(filename):
# used to check if file should be processed
# e.g. zade_30_cat_20180305_224136_stealth.xml is a stealth file
res = re.search('.+_stealth.+\.xml', filename)
if res is None:
return False
return True
def get_channel_from_filename(filename):
# extract the channel from the filename
# e.g. zade_30_cat_20180305_224136.xml has channel 30
return re.search('zade_(?P<channel>\d+)_.*', filename).group('channel')
def get_output_filename(account, original_filename, timestring=None):
# generates the filename of the output csv file
# note that if timestring is not specified, the current time is used
# e.g. <DATE 2018-03-05>_<ACCOUNT 1009>_<original filename>.csv
if timestring is None:
timestring = datetime.now().strftime('%Y-%m-%d')
return '{timestring}_{account}_{original_filename}.csv'.format(
timestring=timestring,
account=account,
original_filename=original_filename)
def write_file(input_file, output_file, account):
xml = open(input_file).read()
result = xmldataset.parse_using_profile(xml, profile)
filename = os.path.basename(input_file)
channel = get_channel_from_filename(filename)
df = pd.DataFrame.from_records(result['articles'])
df['CHANNEL'] = channel
df['ACCOUNT'] = account
# Order the data so it's the same accross all files
df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]
df.to_csv(output_file, index=False)
# TODO: turn this into a script that takes 2 commandline arguments;
# 1. starting directory - used as root from where all the subdirectories of format
# /1009_outfit/tbcat/out_save/2018/03/05 are situated
# 2. output directory - where all the output files are saved
# NOTE: keep track on what the current account is (e.g. 1009) so it can be dynamically set in filename and output file
# for every input file found, run this:
original_filename = os.path.basename(input_file)
output_file = get_output_filename(account, original_filename)
write_file(input_file, output_file, account)
# supports a unix path with directories included
# e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment