gcetusic · March 6, 2018 15:57
diff --git a/tbcat2csv.py b/tbcat2csv.py
 import xmldataset
 import pandas as pd
 import re
 import os
 from datetime import datetime

 # Hard-coded inputs for a single conversion run.
 # TODO: make this dynamic in the script that iterates over files in folders
 input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
 # Account identifier; it ends up in the output filename and the ACCOUNT column.
 account = '1009'
 #################


 # Profile for the xmldataset library - should not be changed.
 # It routes A_NR, A_ID, A_STOCK and A_VK of each ARTICLE into the 'articles'
 # dataset (the key read by write_file) and stores P_CATEGORY in the external
 # dataset 'product_information', referenced again via __EXTERNAL_VALUE__.
 # NOTE(review): the indentation inside the string presumably encodes the XML
 # nesting for xmldataset, so keep the whitespace exactly as it is.
 profile = """
 TBCATALOG
    PRODUCTDATA
        PRODUCT
            P_CATEGORIES
                P_CATEGORY = external_dataset:product_information
            ARTICLEDATA
                ARTICLE
                    A_NR = dataset:articles
                    A_ID = dataset:articles
                    A_STOCK = dataset:articles
                    A_PRICEDATA
                        A_PRICE
                            A_VK = dataset:articles
                    __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
 """

 def is_stealth_file(filename):
    """Return True when *filename* names a stealth export.

    A stealth file carries a ``_stealth`` marker before the ``.xml``
    extension, e.g. ``zade_30_cat_20180305_224136_stealth.xml``.  The
    result is used to check whether a file should be processed.

    :param filename: file name or path to inspect
    :return: ``True`` if the name contains ``_stealth`` followed (possibly
        after further characters) by ``.xml``, otherwise ``False``
    """
    # ``.*`` instead of ``.+``: the marker may sit directly in front of the
    # extension, exactly as in the documented example above.
    return re.search(r'.+_stealth.*\.xml', filename) is not None
    

 def get_channel_from_filename(filename):
    """Extract the channel number from a catalog file name.

    E.g. ``zade_30_cat_20180305_224136.xml`` has channel ``'30'``.  The
    pattern is searched anywhere in *filename*, so a unix path with leading
    directories works as well.

    :param filename: file name or path containing ``zade_<channel>_``
    :return: the channel digits as a string
    :raises ValueError: if *filename* contains no ``zade_<digits>_`` part
    """
    match = re.search(r'zade_(?P<channel>\d+)_.*', filename)
    if match is None:
        # Fail with a descriptive error instead of an AttributeError on None.
        raise ValueError(
            'cannot extract channel from filename: {!r}'.format(filename))
    return match.group('channel')
    

 def get_output_filename(account, original_filename, timestring=None):
    """Build the name of the output csv file.

    The result has the shape ``<date>_<account>_<original filename>.csv``,
    e.g. ``2018-03-05_1009_zade_30_cat_20180305_224136.xml.csv``.

    :param account: account identifier placed in the middle of the name
    :param original_filename: name of the source file, kept in full
    :param timestring: date prefix; when ``None`` the current date formatted
        as ``%Y-%m-%d`` is used
    :return: the generated file name
    """
    name_template = '{timestring}_{account}_{original_filename}.csv'
    fields = {
        'account': account,
        'original_filename': original_filename,
        # Fall back to the current date only when no prefix was supplied.
        'timestring': (datetime.now().strftime('%Y-%m-%d')
                       if timestring is None else timestring),
    }
    return name_template.format(**fields)
    

 def write_file(input_file, output_file, account):
    """Convert one catalog XML file into a csv file of its articles.

    The XML is parsed with the module-level xmldataset ``profile``; the
    resulting ``articles`` records are written to *output_file* together
    with the channel (taken from the input file name) and the account.

    :param input_file: path of the XML file to read
    :param output_file: path of the csv file to write
    :param account: value stored in the ``ACCOUNT`` column of every row
    """
    # Context manager so the file handle is closed even if reading fails.
    with open(input_file) as xml_file:
        xml = xml_file.read()
    result = xmldataset.parse_using_profile(xml, profile)

    # Only the basename is inspected, so input_file may include directories.
    filename = os.path.basename(input_file)
    channel = get_channel_from_filename(filename)

    df = pd.DataFrame.from_records(result['articles'])
    df['CHANNEL'] = channel
    df['ACCOUNT'] = account

    # Order the columns so they are the same across all files
    df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]

    df.to_csv(output_file, index=False)


 # TODO: turn this into a script that takes 2 command-line arguments:
 # 1. starting directory - used as root from where all the subdirectories of format
 #    /1009_outfit/tbcat/out_save/2018/03/05 are situated
 # 2. output directory - where all the output files are saved
 # NOTE: keep track of what the current account is (e.g. 1009) so it can be dynamically set in filename and output file

 # for every input file found, run this:
 # (only the basename of input_file goes into the output filename, so the
 # csv path carries no directory component)
 original_filename =  os.path.basename(input_file)
 output_file = get_output_filename(account, original_filename)
 write_file(input_file, output_file, account) 
 # input_file may be a unix path with directories included
 # e.g.  backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml
	import xmldataset
	import pandas as pd
	import re
	import os
	from datetime import datetime

	# Hard-coded inputs for a single conversion run.
	# TODO: make this dynamic in the script that iterates over files in folders
	input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
	# Account identifier; it ends up in the output filename and the ACCOUNT column.
	account = '1009'
	#################


	# Profile for the xmldataset library - should not be changed.
	# The nesting (4 spaces per level) mirrors the XML hierarchy; it had been
	# flattened in this copy, which removed the parent/child structure.
	# A_NR, A_ID, A_STOCK and A_VK of each ARTICLE go into the 'articles'
	# dataset; P_CATEGORY is kept in the external dataset
	# 'product_information' and referenced again via __EXTERNAL_VALUE__.
	profile = """
	TBCATALOG
	    PRODUCTDATA
	        PRODUCT
	            P_CATEGORIES
	                P_CATEGORY = external_dataset:product_information
	            ARTICLEDATA
	                ARTICLE
	                    A_NR = dataset:articles
	                    A_ID = dataset:articles
	                    A_STOCK = dataset:articles
	                    A_PRICEDATA
	                        A_PRICE
	                            A_VK = dataset:articles
	                    __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
	"""

	def is_stealth_file(filename):
	    """Return True when *filename* names a stealth export.

	    A stealth file carries a ``_stealth`` marker before the ``.xml``
	    extension, e.g. ``zade_30_cat_20180305_224136_stealth.xml``.  The
	    result is used to check whether a file should be processed.

	    :param filename: file name or path to inspect
	    :return: ``True`` if the name contains ``_stealth`` followed (possibly
	        after further characters) by ``.xml``, otherwise ``False``
	    """
	    # ``.*`` instead of ``.+``: the marker may sit directly in front of the
	    # extension, exactly as in the documented example above.
	    return re.search(r'.+_stealth.*\.xml', filename) is not None


	def get_channel_from_filename(filename):
	    """Extract the channel number from a catalog file name.

	    E.g. ``zade_30_cat_20180305_224136.xml`` has channel ``'30'``.  The
	    pattern is searched anywhere in *filename*, so a unix path with leading
	    directories works as well.

	    :param filename: file name or path containing ``zade_<channel>_``
	    :return: the channel digits as a string
	    :raises ValueError: if *filename* contains no ``zade_<digits>_`` part
	    """
	    match = re.search(r'zade_(?P<channel>\d+)_.*', filename)
	    if match is None:
	        # Fail with a descriptive error instead of an AttributeError on None.
	        raise ValueError(
	            'cannot extract channel from filename: {!r}'.format(filename))
	    return match.group('channel')


	def get_output_filename(account, original_filename, timestring=None):
	    """Generate the filename of the output csv file.

	    The result has the shape ``<date>_<account>_<original filename>.csv``,
	    e.g. ``2018-03-05_1009_zade_30_cat_20180305_224136.xml.csv``.

	    :param account: account identifier placed in the middle of the name
	    :param original_filename: name of the source file, kept in full
	    :param timestring: date prefix; when ``None`` the current date formatted
	        as ``%Y-%m-%d`` is used
	    :return: the generated file name
	    """
	    if timestring is None:
	        timestring = datetime.now().strftime('%Y-%m-%d')

	    return '{timestring}_{account}_{original_filename}.csv'.format(
	        timestring=timestring,
	        account=account,
	        original_filename=original_filename)


	def write_file(input_file, output_file, account):
	    """Convert one catalog XML file into a csv file of its articles.

	    The XML is parsed with the module-level xmldataset ``profile``; the
	    resulting ``articles`` records are written to *output_file* together
	    with the channel (taken from the input file name) and the account.

	    :param input_file: path of the XML file to read
	    :param output_file: path of the csv file to write
	    :param account: value stored in the ``ACCOUNT`` column of every row
	    """
	    # Context manager so the file handle is closed even if reading fails.
	    with open(input_file) as xml_file:
	        xml = xml_file.read()
	    result = xmldataset.parse_using_profile(xml, profile)

	    # Only the basename is inspected, so input_file may include directories.
	    filename = os.path.basename(input_file)
	    channel = get_channel_from_filename(filename)

	    df = pd.DataFrame.from_records(result['articles'])
	    df['CHANNEL'] = channel
	    df['ACCOUNT'] = account

	    # Order the columns so they are the same across all files
	    df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]

	    df.to_csv(output_file, index=False)


	# TODO: turn this into a script that takes 2 command-line arguments:
	# 1. starting directory - used as root from where all the subdirectories of format
	# /1009_outfit/tbcat/out_save/2018/03/05 are situated
	# 2. output directory - where all the output files are saved
	# NOTE: keep track of what the current account is (e.g. 1009) so it can be dynamically set in filename and output file

	# for every input file found, run this:
	# (only the basename of input_file goes into the output filename, so the
	# csv path carries no directory component)
	original_filename = os.path.basename(input_file)
	output_file = get_output_filename(account, original_filename)
	write_file(input_file, output_file, account)
	# input_file may be a unix path with directories included
	# e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml