#!/usr/bin/env python
# coding: utf-8
"""Find visually similar JPEG images in a directory and print them as HTML.

Every image is shrunk to a BLOCK_SIZE x BLOCK_SIZE thumbnail; two images are
considered similar when fewer than MAX_DISTANCE thumbnail pixels differ by
more than THRESHOLD in summed channel intensity.
"""
import glob
import os
import random
import sys
from xml.sax.saxutils import escape

import numpy

# Side length (pixels) of the thumbnail every image is reduced to.
BLOCK_SIZE = 20
# Minimum per-pixel difference of summed channel values that counts as
# "this pixel differs".
THRESHOLD = 60
# Value written to the ``width`` attribute of each generated <img> tag.
WIDTH = 200
# Images with fewer differing pixels than this are grouped together.
MAX_DISTANCE = 220
# Directory scanned when none is given on the command line.
DEFAULT_DIRNAME = '/home/maniac/Desktop/4554182'


def image_data(filename):
    """
    Get data from image ready for comparison.

    Returns a flat signed-integer numpy array of BLOCK_SIZE * BLOCK_SIZE
    values, one per thumbnail pixel (row by row), each the sum of the
    pixel's R, G and B channels.
    """
    # Imported here so the pure helpers (distance, html, ...) stay usable
    # without Pillow installed; only reading image files needs it.
    from PIL import Image

    # The context manager closes the underlying file as soon as the
    # thumbnail has been produced.
    with Image.open(filename) as img:
        # Normalising to RGB gives every pixel exactly three channels, so
        # greyscale and CMYK JPEGs are handled and are comparable with
        # ordinary RGB ones.
        small = img.convert('RGB').resize(
            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR)
    # Widen to a signed type before summing so that neither the channel sum
    # nor later subtraction in distance() can wrap around.
    pixels = numpy.asarray(small).astype(numpy.int64)
    return pixels.sum(axis=2).ravel()


def distance(data1, data2):
    """
    Logical distance between two images.

    Counts the positions at which ``data1`` and ``data2`` differ by more than
    THRESHOLD. For arrays produced by image_data() the result lies in
    0..BLOCK_SIZE**2 (0..400 with the default BLOCK_SIZE).
    """
    diff = numpy.abs(numpy.asarray(data1) - numpy.asarray(data2))
    # count_nonzero returns a numpy integer; callers get a plain int.
    return int(numpy.count_nonzero(diff > THRESHOLD))


def duplicates(dirname):
    """
    Finds duplicate images in a directory. All files must be *.jpg.

    Returns an iterator of image groups ([], [], ... []). One group is
    produced per image in the directory: a list of ``(distance, filename)``
    tuples, sorted ascending, for every image closer than MAX_DISTANCE to it.
    The image itself is always part of its own group (at distance 0), and a
    pair of similar images shows up in both images' groups.
    """
    files = glob.glob(os.path.join(dirname, '*.jpg'))
    images = [(f, image_data(f)) for f in files]
    # Groups are emitted in a different, random order on every run.
    random.shuffle(images)
    for _, data in images:
        yield sorted(
            (dist, f)
            for dist, f in ((distance(data, d), f) for f, d in images)
            if dist < MAX_DISTANCE
        )


def html_group(group):
    """
    Renders one group of ``(distance, filename)`` tuples as <img> tags.

    Each image is referenced by its base name only, so the page has to be
    viewed from the directory that holds the images. The distance is exposed
    as the tag's ``title``.
    """
    return ''.join(
        '<img src="%s" width="%s" title="%s">' % (
            # File names may contain &, <, > or double quotes, any of which
            # would otherwise break the attribute value.
            escape(os.path.basename(f), {'"': '&quot;'}),
            WIDTH,
            dist,
        )
        for dist, f in group
    )


def html(groups):
    """
    Generates HTML from groups of image duplicates.

    Groups are rendered with html_group() and separated by line breaks.
    """
    body = '<br>\n'.join(html_group(g) for g in groups)
    return '<html><body>%s</body></html>' % body


def main(argv=None):
    """
    Prints the duplicates page for a directory to standard output.

    The directory is the first item of ``argv`` (``sys.argv[1:]`` when
    ``argv`` is None); DEFAULT_DIRNAME is used when no item is given.
    """
    args = sys.argv[1:] if argv is None else argv
    dirname = args[0] if args else DEFAULT_DIRNAME
    print(html(duplicates(dirname)))


if __name__ == '__main__':
    main()