@gcetusic
Created March 6, 2018 15:57

tbcat2csv.py

import xmldataset
import pandas as pd
import re
import os
from datetime import datetime

# TODO: make this dynamic in the script that iterates over files in folders
input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
account = '1009'
#################


# profile for xmldataset library - should not be changed
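# It flattens every ARTICLE into a row of the 'articles' dataset (A_NR, A_ID,
# A_STOCK and the A_VK price) and, via __EXTERNAL_VALUE__, injects the product's
# P_CATEGORY (captured into the 'product_information' external dataset) into
# each article row.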
profile = """
TBCATALOG
    PRODUCTDATA
        PRODUCT
            P_CATEGORIES
                P_CATEGORY = external_dataset:product_information
            ARTICLEDATA
                ARTICLE
                    A_NR = dataset:articles
                    A_ID = dataset:articles
                    A_STOCK = dataset:articles
                    A_PRICEDATA
                        A_PRICE
                            A_VK = dataset:articles
                    __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
"""

def is_stealth_file(filename):
    # used to check if file should be processed
    # e.g. zade_30_cat_20180305_224136_stealth.xml is a stealth file
    res = re.search(r'.+_stealth.*\.xml', filename)

    if res is None:
        return False

    return True


def get_channel_from_filename(filename):
    # extract the channel from the filename
    # e.g. zade_30_cat_20180305_224136.xml has channel 30
    return re.search(r'zade_(?P<channel>\d+)_.*', filename).group('channel')


def get_output_filename(account, original_filename, timestring=None):
    # generates the filename of the output csv file
    # note that if timestring is not specified, the current date is used
    # e.g. <DATE 2018-03-05>_<ACCOUNT 1009>_<original filename>.csv
    if timestring is None:
        timestring = datetime.now().strftime('%Y-%m-%d')

    return '{timestring}_{account}_{original_filename}.csv'.format(
        timestring=timestring,
        account=account,
        original_filename=original_filename)


def write_file(input_file, output_file, account):
    with open(input_file) as xml_file:
        xml = xml_file.read()
    result = xmldataset.parse_using_profile(xml, profile)

    filename = os.path.basename(input_file)
    channel = get_channel_from_filename(filename)

    df = pd.DataFrame.from_records(result['articles'])
    df['CHANNEL'] = channel
    df['ACCOUNT'] = account

    # Order the columns so they are the same across all files
    df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]

    df.to_csv(output_file, index=False)


# TODO: turn this into a script that takes 2 command-line arguments
# (a possible sketch is appended at the end of this file):
# 1. starting directory - the root under which all subdirectories of the format
#    /1009_outfit/tbcat/out_save/2018/03/05 are situated
# 2. output directory - where all the output files are saved
# NOTE: keep track of what the current account is (e.g. 1009) so it can be set
#       dynamically in the output filename and file contents

# for every input file found, run this:
# input_file may include a full unix path with directories,
# e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml
original_filename = os.path.basename(input_file)
output_file = get_output_filename(account, original_filename)
write_file(input_file, output_file, account)
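

# A possible sketch of the driver described in the TODO above - not wired in yet.
# The directory walk, the way the account is derived from the "<account>_<name>"
# directory prefix (e.g. 1009_outfit -> 1009) and the function/argument names are
# assumptions, not part of the original script.
def convert_tree(start_dir, output_dir):
    # walk the starting directory and convert every non-stealth XML file found
    for dirpath, _, filenames in os.walk(start_dir):
        for filename in filenames:
            if not filename.endswith('.xml') or is_stealth_file(filename):
                continue
            # the account is assumed to be the numeric prefix of the top-level
            # subdirectory, e.g. backup/1009_outfit/... -> 1009
            relative = os.path.relpath(dirpath, start_dir)
            account = relative.split(os.sep)[0].split('_')[0]
            output_file = os.path.join(output_dir, get_output_filename(account, filename))
            write_file(os.path.join(dirpath, filename), output_file, account)

# e.g. convert_tree('backup', 'output_csvs')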