@gcetusic
Created March 6, 2018 15:57

tbcat2csv.py

import xmldataset
import pandas as pd
import re
import os
from datetime import datetime

# TODO: make this dynamic in the script that iterates over files in folders
input_file = 'Downloads/zade_30_cat_20180306_000530.xml'
account = '1009'
#################


# profile for xmldataset library - should not be changed
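# It flattens every ARTICLE into a row of the 'articles' dataset (A_NR, A_ID,
# A_STOCK and the A_VK price) and, via __EXTERNAL_VALUE__, injects the product's
# P_CATEGORY (captured into the 'product_information' external dataset) into
# each article row.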
profile = """
TBCATALOG
    PRODUCTDATA
        PRODUCT
            P_CATEGORIES
                P_CATEGORY = external_dataset:product_information
            ARTICLEDATA
                ARTICLE
                    A_NR = dataset:articles
                    A_ID = dataset:articles
                    A_STOCK = dataset:articles
                    A_PRICEDATA
                        A_PRICE
                            A_VK = dataset:articles
                    __EXTERNAL_VALUE__ = product_information:P_CATEGORY:articles
"""

def is_stealth_file(filename):
    # used to check if file should be processed
    # e.g. zade_30_cat_20180305_224136_stealth.xml is a stealth file
    res = re.search(r'.+_stealth.*\.xml', filename)

    if res is None:
        return False

    return True


def get_channel_from_filename(filename):
    # extract the channel from the filename
    # e.g. zade_30_cat_20180305_224136.xml has channel 30
    return re.search(r'zade_(?P<channel>\d+)_.*', filename).group('channel')


def get_output_filename(account, original_filename, timestring=None):
    # generates the filename of the output csv file
    # note that if timestring is not specified, the current date is used
    # e.g. <DATE 2018-03-05>_<ACCOUNT 1009>_<original filename>.csv
    if timestring is None:
        timestring = datetime.now().strftime('%Y-%m-%d')

    return '{timestring}_{account}_{original_filename}.csv'.format(
        timestring=timestring,
        account=account,
        original_filename=original_filename)


def write_file(input_file, output_file, account):
    with open(input_file) as xml_file:
        xml = xml_file.read()
    result = xmldataset.parse_using_profile(xml, profile)

    filename = os.path.basename(input_file)
    channel = get_channel_from_filename(filename)

    df = pd.DataFrame.from_records(result['articles'])
    df['CHANNEL'] = channel
    df['ACCOUNT'] = account

    # Order the columns so they are the same across all files
    df = df[['A_NR', 'A_ID', 'A_STOCK', 'A_VK', 'CHANNEL', 'ACCOUNT']]

    df.to_csv(output_file, index=False)


# TODO: turn this into a script that takes 2 command-line arguments
# (a possible sketch is appended at the end of this file):
# 1. starting directory - the root under which all subdirectories of the format
#    /1009_outfit/tbcat/out_save/2018/03/05 are situated
# 2. output directory - where all the output files are saved
# NOTE: keep track of what the current account is (e.g. 1009) so it can be set
#       dynamically in the output filename and file contents

# for every input file found, run this:
# input_file may include a full unix path with directories,
# e.g. backup/1009_outfit/tbcat/out_save/2018/03/05/zade_30_cat_20180305_224136.xml
original_filename = os.path.basename(input_file)
output_file = get_output_filename(account, original_filename)
write_file(input_file, output_file, account)
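

# A possible sketch of the driver described in the TODO above - not wired in yet.
# The directory walk, the way the account is derived from the "<account>_<name>"
# directory prefix (e.g. 1009_outfit -> 1009) and the function/argument names are
# assumptions, not part of the original script.
def convert_tree(start_dir, output_dir):
    # walk the starting directory and convert every non-stealth XML file found
    for dirpath, _, filenames in os.walk(start_dir):
        for filename in filenames:
            if not filename.endswith('.xml') or is_stealth_file(filename):
                continue
            # the account is assumed to be the numeric prefix of the top-level
            # subdirectory, e.g. backup/1009_outfit/... -> 1009
            relative = os.path.relpath(dirpath, start_dir)
            account = relative.split(os.sep)[0].split('_')[0]
            output_file = os.path.join(output_dir, get_output_filename(account, filename))
            write_file(os.path.join(dirpath, filename), output_file, account)

# e.g. convert_tree('backup', 'output_csvs')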