|
|
@@ -0,0 +1,91 @@ |
|
|
#!/usr/bin/env python3 |
|
|
|
|
|
|
|
|
import argparse |
|
|
import sys |
|
|
import mmap |
|
|
import logging |
|
|
|
|
|
from collections import defaultdict |
|
|
|
|
|
# Configure the root logger once at import time: records go to stderr and
# only ERROR and above are emitted unless the level is changed later.
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s %(levelname)-8s %(message)s',
    datefmt='%Y-%m-%dT%H:%M:%S',
    handlers=[logging.StreamHandler(sys.stderr)],
)
|
|
|
|
|
|
|
|
def convertToNumber(values, endianness):
    """Interpret a sequence of byte values as a single unsigned integer.

    Args:
        values: A bytes-like object, or an iterable of ints in range(256).
        endianness: 'big' if the first byte is the most significant one.
            Any other value is treated as 'little' (first byte is the
            least significant one).

    Returns:
        The decoded non-negative integer; 0 when ``values`` is empty.
    """
    # The previous hand-rolled loop accumulated most-significant-byte first
    # and reversed the input for 'big', which swapped the two byte orders.
    # int.from_bytes implements the conventional meaning directly.
    byteOrder = 'big' if endianness == 'big' else 'little'
    return int.from_bytes(values, byteOrder)
|
|
|
|
|
|
|
|
def produceFrequencies(filename, args):
    """Count fixed-size byte groups in a file and print their frequencies.

    The file is split into consecutive, non-overlapping groups of
    ``args.size`` bytes. Each group is decoded with ``convertToNumber``
    using ``args.endianness`` and tallied. One line is printed per distinct
    value: the count (zero-padded to the number of digits in the file size),
    a space, and the value (zero-padded to ``2 * args.size`` digits, in hex
    or decimal according to ``args.display_type``).

    Args:
        filename: Path of the file to analyse.
        args: Object with the attributes ``size``, ``endianness``,
            ``sort_values`` and ``display_type`` (as produced by the
            script's argument parser).

    Raises:
        ValueError: If ``args.size`` is smaller than 1.
        OSError: If the file cannot be opened or mapped.
    """
    ngramSize = args.size
    if ngramSize < 1:
        raise ValueError(
            'n-gram size must be at least 1 byte, got {:d}'.format(ngramSize))

    if args.display_type == 'hex':
        outputType = 'x'
    elif args.display_type == 'decimal':
        outputType = 'd'
    else:
        logging.error("Invalid output type, defaulting to hex")
        # Actually apply the promised default; leaving this unset used to
        # crash while building the format string below.
        outputType = 'x'

    frequencies = defaultdict(int)

    with open(filename, 'rb') as rawHandle:
        # Determine the size up front: mmap refuses to map an empty file.
        rawHandle.seek(0, 2)
        fileSize = rawHandle.tell()
        if fileSize == 0:
            logging.warning('Skipping empty file {:s}'.format(str(filename)))
            return

        # Integer division keeps the count exact even for very large files.
        numValues = fileSize // ngramSize
        remainder = fileSize % ngramSize
        if remainder != 0:
            logging.warning('Ignoring {:d} bytes at the end'.format(remainder))

        # Map the file instead of reading it so giant files can be sliced
        # without loading them into memory in one go. ACCESS_READ is the
        # portable constant for the ``access`` argument.
        with mmap.mmap(rawHandle.fileno(), 0,
                       access=mmap.ACCESS_READ) as fileHandle:
            for offset in range(0, numValues * ngramSize, ngramSize):
                nextBytes = fileHandle[offset:offset + ngramSize]
                nextVal = convertToNumber(nextBytes, args.endianness)
                frequencies[nextVal] += 1

    values = frequencies.items()
    if args.sort_values:
        # Most frequent first; ties keep first-seen order (stable sort).
        values = sorted(values, key=lambda item: item[1], reverse=True)

    outputPadder = ngramSize * 2
    outputFormatter = '{{:0{:d}d}} {{:0{:d}{}}}'.format(
        len(str(fileSize)), outputPadder, outputType)
    for value, freq in values:
        print(outputFormatter.format(freq, value))
|
|
|
|
|
|
|
|
|
|
|
def _positiveInt(text):
    """argparse ``type`` callable: parse ``text`` as an integer >= 1."""
    try:
        value = int(text)
    except ValueError:
        raise argparse.ArgumentTypeError(
            'invalid int value: {!r}'.format(text)) from None
    if value < 1:
        raise argparse.ArgumentTypeError(
            'must be at least 1, got {:d}'.format(value))
    return value


def parseArguments(argv=None):
    """Parse the command line and apply the requested logging level.

    Args:
        argv: Optional list of argument strings. When None (the default),
            argparse reads the process command line.

    Returns:
        The argparse namespace with ``files``, ``sort_values``,
        ``display_type``, ``size``, ``endianness`` and ``verbose``.

    Side effects:
        If ``--verbose`` names an integer attribute of the ``logging``
        module (e.g. DEBUG, INFO, WARNING), the root logger level is set to
        it. An unrecognised name is reported and otherwise ignored.
        Invalid arguments make argparse exit the process (SystemExit).
    """
    parser = argparse.ArgumentParser(description="Arguments for script")
    parser.add_argument('files', nargs='+')
    parser.add_argument('-S', '--sort_values', action='store_true',
                        default=False,
                        help='If specified, sort the output by frequency')
    parser.add_argument('-d', '--display_type', action='store',
                        default='hex', choices=['hex', 'decimal'],
                        help='Specify the output format (hex or decimal, '
                             'NO OCTAL FOR YOU)')
    # Reject sizes below 1 here: they would otherwise fail later with a
    # division by zero or produce no output at all.
    parser.add_argument('-s', '--size', action='store', type=_positiveInt,
                        default=1,
                        help='Specifies the number of bytes to compute '
                             'frequencies for')
    parser.add_argument('-e', '--endianness', action='store',
                        default='little', choices=['little', 'big'],
                        help='Specify the endianness to compute multi-byte '
                             'values (default is little endian)')
    parser.add_argument('-v', '--verbose', action='store', default=None,
                        metavar='LEVEL',
                        help='If specified, set the logging level to LEVEL '
                             '(e.g. DEBUG, INFO, WARNING)')

    args = parser.parse_args(argv)

    if args.verbose is not None:
        newLevel = getattr(logging, args.verbose.upper(), None)
        if isinstance(newLevel, int):
            logging.getLogger().setLevel(newLevel)
        else:
            # Previously a typo in the level name was silently ignored.
            logging.error('Unknown logging level %r, keeping current level',
                          args.verbose)

    return args
|
|
|
|
|
|
|
|
|
|
|
def main():
    """Parse the command line, then report frequencies for each input file.

    Files are processed one after another in the order given; the same
    parsed options are passed along for every file.
    """
    options = parseArguments()
    for path in options.files:
        produceFrequencies(path, options)
|
|
|
|
|
|
|
|
# Run the command-line entry point only when this file is executed as a
# script; importing it as a module must not trigger any processing.
if __name__ == "__main__":
    main()
|
|
|