#!/usr/bin/env python
# coding: utf-8
import os
import random
import glob
from PIL import Image
import numpy
# Side length, in pixels, of the square thumbnail every image is resized to
# before comparison (so each image yields BLOCK_SIZE * BLOCK_SIZE samples).
BLOCK_SIZE = 20
# A pixel counts as "different" when the absolute difference between the two
# images' per-pixel channel sums exceeds this value.
THRESHOLD = 60
# Value substituted into the per-image HTML produced by html_group;
# presumably the display width in pixels -- confirm against the markup.
WIDTH = 200
# Exclusive upper bound on distance() for an image to be placed in a group.
MAX_DISTANCE = 220
def image_data(filename):
    """
    Get data from image ready for comparison.

    The image is normalised to RGB, shrunk to a BLOCK_SIZE x BLOCK_SIZE
    thumbnail with bilinear resampling, and reduced to one number per pixel:
    the sum of its red, green and blue values.

    Returns a flat numpy array of BLOCK_SIZE * BLOCK_SIZE signed integers,
    in row-major (left-to-right, top-to-bottom) pixel order.
    """
    # The context manager closes the underlying file as soon as the pixel
    # data has been read, instead of leaving it to the garbage collector.
    with Image.open(filename) as img:
        # Converting to RGB guarantees exactly three channels per pixel, so
        # grayscale files no longer break the summation and files with extra
        # bands (alpha, CMYK) stay comparable with plain RGB ones.
        thumbnail = img.convert('RGB').resize(
            (BLOCK_SIZE, BLOCK_SIZE), Image.BILINEAR
        )
    # A signed dtype is requested explicitly: callers subtract two of these
    # arrays, and unsigned arithmetic would wrap around instead of going
    # negative.
    pixels = numpy.asarray(thumbnail, dtype=numpy.int64)
    return pixels.sum(axis=2).ravel()
def distance(data1, data2):
    """
    Logical distance between two images on a scale 0..400

    data1 and data2 are equally sized numpy arrays as produced by
    image_data. The result is the number of positions at which the two
    arrays differ by more than THRESHOLD in absolute value, returned as a
    plain int.
    """
    # Vectorised comparison: this runs once per image pair, so avoiding a
    # Python-level loop over every element matters.
    differing = numpy.abs(data1 - data2) > THRESHOLD
    return int(numpy.count_nonzero(differing))
def duplicates(dirname):
    """
    Finds duplicate images in a directory.
    All files must be *.jpg.

    This is a generator. For every image found it yields one group: a
    sorted list of (distance, filename) tuples covering each image in the
    directory (the reference image itself included) whose distance to the
    reference is below MAX_DISTANCE. Images are visited in random order.
    """
    pattern = os.path.join(dirname, '*.jpg')
    loaded = [(path, image_data(path)) for path in glob.glob(pattern)]
    random.shuffle(loaded)
    for _, reference in loaded:
        group = []
        # Compare the reference against every loaded image, itself included.
        for path, candidate in loaded:
            score = distance(reference, candidate)
            if score < MAX_DISTANCE:
                group.append((score, path))
        group.sort()
        yield group
def html_group(group):
    """
    Render one group of similar images as a string of HTML.

    group is an iterable of (distance, filename) tuples. Each entry becomes
    one <img> element whose source is the file's base name, whose width is
    WIDTH and whose title is the distance. The elements are concatenated
    without separators.
    """
    # The previous format string held a single placeholder for three values,
    # which raises TypeError on any non-empty group.
    # NOTE(review): the <img> markup is a reconstruction -- confirm it matches
    # the intended output.
    return ''.join(
        '<img src="%s" width="%s" title="%s">' % (
            os.path.basename(f), WIDTH, dist
        )
        for dist, f in group
    )
def html(groups):
"""
Generates HTML from groups of image duplicates
"""
body = '