Skip to content

Instantly share code, notes, and snippets.

@jjjake
Last active March 16, 2017 23:15
Show Gist options
  • Save jjjake/28eab391c97a2ea2691c074447e99b09 to your computer and use it in GitHub Desktop.
Save jjjake/28eab391c97a2ea2691c074447e99b09 to your computer and use it in GitHub Desktop.

Revisions

  1. jjjake revised this gist Mar 16, 2017. 1 changed file with 6 additions and 5 deletions.
    11 changes: 6 additions & 5 deletions audit_gb_shipment.py
    Original file line number Diff line number Diff line change
    @@ -18,9 +18,10 @@ def get_gb_counts(tsv):
    return counts


    def audit_shipment(metadata_file, counts):
    """
    BEFORE running this script, Generate a JSONL document of all metadata for
    def audit_shipment(metadata_file, gb_counts):
    """Audit shipment with counts from GB.
    BEFORE running this script, Generate a JSONL document of all metadata for
    all items for the given shipment:
    ia search 'collection:georgeblood AND shiptracking:15446' -p scope:all -i \
    @@ -34,8 +35,8 @@ def audit_shipment(metadata_file, counts):
    tifs = len([f for f in j['files'] if f['format'] == 'TIFF'])
    barcode = j['metadata']['collection-catalog-number'].lower()

    assert flacs == counts[barcode]['flacs']
    assert tifs == counts[barcode]['tifs']
    assert flacs == gb_counts[barcode]['flacs']
    assert tifs == gb_counts[barcode]['tifs']

    print('success, audit complete!')

  2. jjjake revised this gist Mar 16, 2017. 1 changed file with 2 additions and 2 deletions.
    4 changes: 2 additions & 2 deletions audit_gb_shipment.py
    Original file line number Diff line number Diff line change
    @@ -18,7 +18,7 @@ def get_gb_counts(tsv):
    return counts


    def audit_shipment(metadata_file):
    def audit_shipment(metadata_file, counts):
    """
    BEFORE running this script, Generate a JSONL document of all metadata for
    all items for the given shipment:
    @@ -42,4 +42,4 @@ def audit_shipment(metadata_file):

    if __name__ == '__main__':
    counts = get_gb_counts('/Users/archive/78rpm-utils/spreadsheets/iArchiveExport_FileCount_20170315.tsv')
    audit_shipment('15446.jsonl')
    audit_shipment('15446.jsonl', counts)
  3. jjjake revised this gist Mar 16, 2017. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion audit_gb_shipment.py
    Original file line number Diff line number Diff line change
    @@ -20,7 +20,8 @@ def get_gb_counts(tsv):

    def audit_shipment(metadata_file):
    """
    Generate a JSONL document of all metadata for all items for the given shipment:
    BEFORE running this script, Generate a JSONL document of all metadata for
    all items for the given shipment:
    ia search 'collection:georgeblood AND shiptracking:15446' -p scope:all -i \
    | parallel 'ia md {}' \
  4. jjjake created this gist Mar 16, 2017.
    44 changes: 44 additions & 0 deletions audit_gb_shipment.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,44 @@
    import json


    def get_gb_counts(tsv):
        """Load the expected per-barcode file counts from a GB TSV export.

        Every data row is read as tab-separated columns in the order
        ``barcode, flac count, tif count``; any further columns are ignored.
        A row whose first column is ``barcode`` (case-insensitive) is treated
        as the header and skipped, as are blank lines.

        Args:
            tsv: Path of the tab-separated file to read.

        Returns:
            A dict keyed by the lowercased barcode, i.e.
            ``{'gbia00000x': {'flacs': 9, 'tifs': 1}}``. The ``flacs`` value
            is the count from the file plus one.

        Raises:
            IndexError: If a data row has fewer than three columns.
            ValueError: If a count column is not an integer.
        """
        counts = {}
        # The context manager closes the handle even when a row fails to parse.
        with open(tsv) as tsv_file:
            for line in tsv_file:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if not line.strip():
                    continue
                # Split once per row instead of once per column.
                fields = line.split('\t')
                barcode = fields[0].lower()
                # Skip header row.
                if barcode == 'barcode':
                    continue
                # Add 1 to GB count because we make a copy of the best version.
                flacs = int(fields[1]) + 1
                tifs = int(fields[2])
                counts[barcode] = {'flacs': flacs, 'tifs': tifs}

        return counts


    def audit_shipment(metadata_file, gb_counts=None):
        r"""Audit the items of a shipment against the file counts from GB.

        BEFORE running this script, generate a JSONL document of all metadata
        for all items for the given shipment:

            ia search 'collection:georgeblood AND shiptracking:15446' -p scope:all -i \
            | parallel 'ia md {}' \
            | pv -acbrl > 15446.jsonl

        Each non-blank line of the file is parsed as one JSON object. For each
        item, the files whose ``name`` ends in ``.flac`` and the files whose
        ``format`` is ``'TIFF'`` are counted and compared with the expected
        counts stored under the item's lowercased
        ``metadata['collection-catalog-number']``.

        Args:
            metadata_file: Path of the JSONL metadata document.
            gb_counts: Optional mapping of lowercased barcode to
                ``{'flacs': int, 'tifs': int}``, as built by
                ``get_gb_counts``. When omitted, the module-level ``counts``
                is used, which is what this function did before the
                parameter existed.

        Raises:
            AssertionError: On the first item whose FLAC or TIFF count differs
                from the expected count.
            KeyError: If an item's barcode is missing from the counts, or an
                item lacks one of the keys read above.
        """
        if gb_counts is None:
            # Backward compatibility: callers that pass only the file name
            # rely on the module-level ``counts`` set by the script entry point.
            gb_counts = counts

        # The context manager closes the handle even when the audit fails.
        with open(metadata_file) as metadata:
            for line in metadata:
                line = line.strip()
                # Tolerate blank lines instead of crashing in json.loads.
                if not line:
                    continue
                j = json.loads(line)
                flacs = sum(1 for f in j['files'] if f['name'].endswith('.flac'))
                tifs = sum(1 for f in j['files'] if f['format'] == 'TIFF')
                barcode = j['metadata']['collection-catalog-number'].lower()
                expected = gb_counts[barcode]

                # Raise explicitly rather than use ``assert``: assert statements
                # are removed under ``python -O``, which would silently turn the
                # audit into a no-op. The exception type is kept for callers.
                if flacs != expected['flacs']:
                    raise AssertionError(
                        '{}: found {} flacs, expected {}'.format(
                            barcode, flacs, expected['flacs']))
                if tifs != expected['tifs']:
                    raise AssertionError(
                        '{}: found {} tifs, expected {}'.format(
                            barcode, tifs, expected['tifs']))

        print('success, audit complete!')


    if __name__ == '__main__':
        # Spreadsheet exported by GB holding the expected file counts.
        gb_tsv = '/Users/archive/78rpm-utils/spreadsheets/iArchiveExport_FileCount_20170315.tsv'
        # ``counts`` must stay a module-level name: audit_shipment reads it
        # when it is called with only the metadata file.
        counts = get_gb_counts(gb_tsv)
        audit_shipment('15446.jsonl')