Skip to content

Instantly share code, notes, and snippets.

@TheRockStarDBA
Forked from notareverser/histogram.py
Created July 7, 2022 18:47
Show Gist options
  • Save TheRockStarDBA/dc0423fbf49322927ed05bf6b32e2d37 to your computer and use it in GitHub Desktop.
Save TheRockStarDBA/dc0423fbf49322927ed05bf6b32e2d37 to your computer and use it in GitHub Desktop.

Revisions

  1. @notareverser notareverser created this gist Jul 7, 2022.
    91 changes: 91 additions & 0 deletions histogram.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,91 @@
    #!/usr/bin/env python3


    import argparse
    import sys
    import mmap
    import logging

    from collections import defaultdict

    # Send diagnostics to stderr so they never mix with the histogram that is
    # printed on stdout. Only errors are shown by default.
    logging.basicConfig(
        level=logging.ERROR,
        format='%(asctime)s %(levelname)-8s %(message)s',
        datefmt='%Y-%m-%dT%H:%M:%S',
        handlers=[logging.StreamHandler(sys.stderr)],
    )


    def convertToNumber(values, endianness):
        """Interpret a sequence of byte values as one unsigned integer.

        Args:
            values: A bytes-like object (or iterable of ints in range(256))
                holding the bytes in the order they appear in the file.
            endianness: 'big' treats the first byte as the most significant
                one. Any other value (normally 'little') treats the first
                byte as the least significant one.

        Returns:
            The decoded non-negative integer; 0 for empty input.
        """
        # Delegate to int.from_bytes so that 'little' really means
        # least-significant byte first. A hand-rolled shift loop that also
        # reverses the input for 'big' ends up swapping the two byte orders.
        byteorder = 'big' if endianness == 'big' else 'little'
        return int.from_bytes(values, byteorder)


    def produceFrequencies(filename, args):
        """Print a frequency histogram of fixed-size byte groups in a file.

        The file is split into consecutive, non-overlapping groups of
        ``args.size`` bytes. Each group is converted to an integer with
        convertToNumber() and counted. One line per distinct value is then
        printed to stdout as ``<count> <value>``, both zero-padded.

        Args:
            filename: Path of the file to analyse.
            args: Parsed command-line namespace. The attributes read here are
                ``size``, ``endianness``, ``sort_values`` and ``display_type``.

        Raises:
            ValueError: If ``args.size`` is smaller than 1.
            OSError: If the file cannot be opened or memory-mapped.
        """
        import os  # local import: only needed for the size check below

        ngramSize = args.size
        if ngramSize < 1:
            raise ValueError(
                'n-gram size must be a positive integer, got {!r}'.format(ngramSize))

        frequencies = defaultdict(int)

        # The file is memory-mapped so that giant inputs do not have to be
        # read into memory. Both handles are closed by the with-blocks.
        with open(filename, 'rb') as rawHandle:
            fileSize = os.fstat(rawHandle.fileno()).st_size
            if fileSize == 0:
                # mmap refuses to map a zero-length file; there is nothing to
                # count anyway.
                logging.warning('Skipping empty file %s', filename)
                return

            with mmap.mmap(rawHandle.fileno(), 0,
                           access=mmap.ACCESS_READ) as mapped:
                # Integer arithmetic keeps the count exact for huge files.
                numValues, remainder = divmod(fileSize, ngramSize)
                if remainder != 0:
                    logging.warning(
                        'Ignoring {:d} bytes at the end'.format(remainder))

                for offset in range(0, numValues * ngramSize, ngramSize):
                    chunk = mapped[offset:offset + ngramSize]
                    frequencies[convertToNumber(chunk, args.endianness)] += 1

        values = frequencies.items()
        if args.sort_values:
            # Most frequent values first; ties keep first-seen order.
            values = sorted(values, key=lambda item: item[1], reverse=True)

        if args.display_type == 'decimal':
            outputType = 'd'
        else:
            if args.display_type != 'hex':
                logging.error("Invalid output type, defaulting to hex")
            outputType = 'x'

        # Counts are padded to the number of digits in the file size (an upper
        # bound for any count); values are padded to two digits per byte.
        countWidth = len(str(fileSize))
        valueWidth = ngramSize * 2
        outputFormatter = '{{:0{:d}d}} {{:0{:d}{}}}'.format(
            countWidth, valueWidth, outputType)

        for value, freq in values:
            print(outputFormatter.format(freq, value))



    def parseArguments():
        """Parse the command line and apply the requested log level.

        Reads ``sys.argv`` through argparse. When ``--verbose`` names a level
        known to the logging module (e.g. ``info``, ``debug``), the root
        logger is switched to that level; an unknown name is reported and the
        current level is kept.

        Returns:
            The argparse namespace with ``files``, ``sort_values``,
            ``display_type``, ``size``, ``endianness`` and ``verbose``.

        Raises:
            SystemExit: Raised by argparse on invalid arguments, including a
                ``--size`` smaller than 1.
        """
        parser = argparse.ArgumentParser(description="Arguments for script")
        parser.add_argument('files', nargs='+')
        parser.add_argument('-S', '--sort_values', action='store_true', default=False, help='If specified, sort the output by frequency')
        parser.add_argument('-d', '--display_type', action='store', default='hex', choices=['hex', 'decimal'], help='Specify the output format (hex or decimal, NO OCTAL FOR YOU')
        parser.add_argument('-s', '--size', action='store', type=int, default=1, help='Specifies the number of bytes to compute frequencies for')
        parser.add_argument('-e', '--endianness', action='store', default='little', choices=['little', 'big'], help='Specify the endianness to compute multi-byte values (default is little endian)')
        parser.add_argument('-v', '--verbose', action='store', default=None, help='If specified, output verbose input')

        args = parser.parse_args()

        # A size of zero or less cannot be used to split a file into groups;
        # reject it here instead of failing later with a division error.
        if args.size < 1:
            parser.error('--size must be a positive integer')

        if args.verbose is not None:
            # Level names are module attributes of logging (logging.DEBUG ...).
            newLevel = getattr(logging, args.verbose.upper(), None)
            if isinstance(newLevel, int):
                logging.getLogger().setLevel(newLevel)
            else:
                logging.error(
                    'Unknown log level %r, keeping the current level',
                    args.verbose)

        return args



    def main():
        """Script entry point: print a histogram for every file given."""
        options = parseArguments()
        # The same options apply to each file; files are handled in the order
        # they were listed on the command line.
        for path in options.files:
            produceFrequencies(path, options)


    # Run the tool only when executed as a script, not when imported.
    if __name__ == "__main__":
        main()