From 9997dab79134121a82ad7a2cc0c0fd4f8da99fce Mon Sep 17 00:00:00 2001
From: Mirko Scholz
Date: Sun, 9 Jan 2022 23:25:52 +0100
Subject: [PATCH] initial python binding

---
Notes (not part of the commit message):
- The line structure of this patch had been lost and was restored.
- The HTML string literals in phash_show_similar.py had been stripped
  and were reconstructed; compare them with the original commit.
- The author address on the From: line and the blob ids on the "index"
  lines were not regenerated.

 bindings/python/README.md              | 18 ++++++++
 bindings/python/phash.pxd              |  9 ++++
 bindings/python/phash.pyx              | 63 ++++++++++++++++++++++++++++
 bindings/python/phash_add_directory.py | 77 +++++++++++++++++++++++++++++++++++
 bindings/python/phash_show_similar.py  | 78 +++++++++++++++++++++++++++++++++++
 bindings/python/setup.py               | 12 +++++
 6 files changed, 257 insertions(+)
 create mode 100644 bindings/python/README.md
 create mode 100644 bindings/python/phash.pxd
 create mode 100644 bindings/python/phash.pyx
 create mode 100644 bindings/python/phash_add_directory.py
 create mode 100644 bindings/python/phash_show_similar.py
 create mode 100644 bindings/python/setup.py

diff --git a/bindings/python/README.md b/bindings/python/README.md
new file mode 100644
index 0000000..e526660
--- /dev/null
+++ b/bindings/python/README.md
@@ -0,0 +1,18 @@
+# Python binding to use pHash on images
+
+## Local use
+
+Check that all dependencies are available and create the header pHash.h:
+
+```
+mkdir build-phash
+cd build-phash
+cmake ../../../
+cd ..
+```
+
+Build the Python extension in the current directory:
+
+```
+python setup.py build_ext --inplace
+```
diff --git a/bindings/python/phash.pxd b/bindings/python/phash.pxd
new file mode 100644
index 0000000..8f99ce3
--- /dev/null
+++ b/bindings/python/phash.pxd
@@ -0,0 +1,9 @@
+# distutils: language=c++
+from libc.stdint cimport uint8_t, uint64_t
+
+# The functions of pHash.h that are used by phash.pyx.
+cdef extern from "../../src/pHash.h":
+    uint8_t* ph_mh_imagehash(const char *, int&, float, float)
+    double ph_hammingdistance2(uint8_t*, int, uint8_t*, int)
+    int ph_dct_imagehash(const char *path, uint64_t&)
+    int ph_hamming_distance(uint64_t hash1, uint64_t hash2)
diff --git a/bindings/python/phash.pyx b/bindings/python/phash.pyx
new file mode 100644
index 0000000..8796951
--- /dev/null
+++ b/bindings/python/phash.pyx
@@ -0,0 +1,63 @@
+# distutils: language=c++
+import os
+from libc.stdlib cimport free
+from libc.stdint cimport uint8_t, uint64_t
+
+from phash cimport (ph_mh_imagehash, ph_hammingdistance2,
+                    ph_dct_imagehash, ph_hamming_distance)
+
+
+cdef class MHImageHash:
+    """Image hash computed by ph_mh_imagehash(), kept as bytes."""
+    cdef bytes c_hash
+
+    def __getstate__(self):
+        return self.c_hash
+
+    def __setstate__(self, state):
+        self.c_hash = state
+
+    def hamming_distance(self, MHImageHash other not None):
+        """Return 100 * ph_hammingdistance2(self, other) as an int."""
+        return int(round(100 * ph_hammingdistance2(
+            self.c_hash, len(self.c_hash),
+            other.c_hash, len(other.c_hash))))
+
+    @staticmethod
+    def from_path(path, alpha=2.0, level=1.0):
+        """Hash the image at *path*; raise RuntimeError on failure."""
+        cdef uint8_t* c_hash
+        cdef int c_hashlen = 0
+        c_hash = ph_mh_imagehash(os.fsencode(path), c_hashlen, alpha, level)
+        if c_hash is NULL:
+            raise RuntimeError("ph_mh_imagehash failed")
+        obj = MHImageHash()
+        try:
+            # Copy the C buffer into a bytes object before freeing it.
+            obj.c_hash = c_hash[:c_hashlen]
+        finally:
+            free(c_hash)
+        return obj
+
+
+cdef class DCTImageHash:
+    """Image hash computed by ph_dct_imagehash(), kept as uint64_t."""
+    cdef uint64_t c_hash
+
+    def __getstate__(self):
+        return self.c_hash
+
+    def __setstate__(self, state):
+        self.c_hash = state
+
+    def hamming_distance(self, DCTImageHash other not None):
+        """Return ph_hamming_distance() of the two hashes."""
+        return ph_hamming_distance(self.c_hash, other.c_hash)
+
+    @staticmethod
+    def from_path(path):
+        """Hash the image at *path*; raise RuntimeError on failure."""
+        obj = DCTImageHash()
+        if ph_dct_imagehash(os.fsencode(path), obj.c_hash) != 0:
+            raise RuntimeError("ph_dct_imagehash failed")
+        return obj
diff --git a/bindings/python/phash_add_directory.py b/bindings/python/phash_add_directory.py
new file mode 100644
index 0000000..724d200
--- /dev/null
+++ b/bindings/python/phash_add_directory.py
@@ -0,0 +1,77 @@
+"""
+Hash all images in the specified directory
+"""
+import os
+import pickle
+import argparse
+from pathlib import Path
+from multiprocessing import Pool, cpu_count
+from phash import DCTImageHash, MHImageHash
+
+
+def dct_hash_image(path):
+    """Print *path* and return its DCTImageHash (runs in a pool worker)."""
+    print(path, flush=True)
+    return DCTImageHash.from_path(Path(path))
+
+
+def mh_hash_image(path):
+    """Print *path* and return its MHImageHash (runs in a pool worker)."""
+    print(path, flush=True)
+    return MHImageHash.from_path(Path(path))
+
+
+# Value of --hash-method -> (hash class, function computing one hash)
+HASH_METHODS = {
+    'dct': (DCTImageHash, dct_hash_image),
+    'mh': (MHImageHash, mh_hash_image),
+}
+
+
+def parse_args():
+    """Parse the command line."""
+    p = argparse.ArgumentParser()
+    p.add_argument('directory')
+    p.add_argument('cache_file', default='phash.cache', nargs='?')
+    p.add_argument('-e', '--extension', default='.jpg')
+    p.add_argument('-m', '--hash-method', default='dct', choices=tuple(HASH_METHODS))
+    return p.parse_args()
+
+
+def load_cache(cache_file):
+    """Return the (image_files, hashes) lists of *cache_file*, empty if missing."""
+    if not os.path.exists(cache_file):
+        return [], []
+    # NOTE: unpickling can run arbitrary code; only load caches you created.
+    with open(cache_file, 'rb') as fin:
+        image_files, hashes = pickle.load(fin)
+    return list(image_files), list(hashes)
+
+
+def main():
+    """Hash the images not yet in the cache and add them to it."""
+    opts = parse_args()
+    if not os.path.isdir(opts.directory):
+        raise RuntimeError('directory %s does not exist' % opts.directory)
+    hash_class, hash_image = HASH_METHODS[opts.hash_method]
+    cached_files, cached_hashes = load_cache(opts.cache_file)
+    # Distances can only be computed between hashes of the same class.
+    if cached_hashes and not isinstance(cached_hashes[0], hash_class):
+        raise RuntimeError('%s does not contain %s hashes'
+                           % (opts.cache_file, opts.hash_method))
+    # Files that are already cached are skipped: hashing them again would
+    # only add duplicate entries.
+    known = set(cached_files)
+    candidates = (os.path.join(opts.directory, name)
+                  for name in sorted(os.listdir(opts.directory))
+                  if name.endswith(opts.extension))
+    image_files = [path for path in candidates if path not in known]
+    with Pool(processes=cpu_count()) as pool:
+        hashes = pool.map(hash_image, image_files)
+    with open(opts.cache_file, 'wb') as fout:
+        pickle.dump((image_files + cached_files, hashes + cached_hashes),
+                    fout, pickle.HIGHEST_PROTOCOL)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bindings/python/phash_show_similar.py b/bindings/python/phash_show_similar.py
new file mode 100644
index 0000000..0ee2ebb
--- /dev/null
+++ b/bindings/python/phash_show_similar.py
@@ -0,0 +1,78 @@
+"""
+Print a list of similar pictures
+optionally delete it (maybe in a second pass)
+"""
+import os
+import sys
+import argparse
+import pickle
+from html import escape
+
+
+def parse_args():
+    """Parse the command line."""
+    p = argparse.ArgumentParser()
+    p.add_argument('cache_file')
+    p.add_argument('-t', '--threshold', type=int, default=10)
+    p.add_argument('-d', '--delete-threshold', type=float)
+    return p.parse_args()
+
+
+def image_link(image_files, index):
+    """Return an HTML link to the image file with the given index."""
+    return '<a href="%s">[%03d]</a>' % (escape(image_files[index]), index)
+
+
+def main():
+    """Print an HTML table of similar images, optionally deleting duplicates."""
+    opts = parse_args()
+    # NOTE: unpickling can run arbitrary code; only load caches you created.
+    with open(opts.cache_file, 'rb') as fin:
+        image_files, hashes = pickle.load(fin)
+    if not hashes:
+        # hashes[0] below would raise IndexError on an empty cache.
+        print('no hashes in', opts.cache_file, file=sys.stderr)
+        return
+    print('hashed', len(hashes), 'files with', hashes[0].__class__.__name__, file=sys.stderr)
+
+    print('<html><head><title>', escape(opts.cache_file), '</title></head>')
+    print('<body>')
+    print('<table>')
+    print('<tr><th></th>')
+    for i in range(opts.threshold):
+        print('<th>%d</th>' % i)
+    print('</tr>')
+    deleted = set()  # indices of the files unlinked so far
+    for i1, h1 in enumerate(hashes):
+        if i1 in deleted:
+            continue
+        similar = {}
+        for i2 in range(i1 + 1, len(hashes)):
+            if i2 in deleted or image_files[i2] == image_files[i1]:
+                # Never unlink a file twice, and never treat a path that is
+                # listed twice in the cache as a duplicate of itself.
+                continue
+            distance = h1.hamming_distance(hashes[i2])
+            if opts.delete_threshold is not None and distance < opts.delete_threshold:
+                print('deleting file', image_files[i2], file=sys.stderr)
+                os.unlink(image_files[i2])
+                deleted.add(i2)
+            elif distance < opts.threshold:
+                similar.setdefault(distance, []).append(i2)
+        if similar:
+            print('<tr>')
+            print('<td>%s</td>' % image_link(image_files, i1))
+            for i in range(opts.threshold):
+                if i in similar:
+                    print('<td>')
+                    for j in similar[i]:
+                        print(image_link(image_files, j))
+                    print('</td>')
+                else:
+                    print('<td></td>')
+            print('</tr>')
+    print('</table></body></html>')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bindings/python/setup.py b/bindings/python/setup.py
new file mode 100644
index 0000000..c370420
--- /dev/null
+++ b/bindings/python/setup.py
@@ -0,0 +1,12 @@
+import os
+# Extension comes from setuptools: distutils was removed in Python 3.12.
+from setuptools import setup, Extension
+from Cython.Build import cythonize
+
+phash_ext = Extension('phash',
+        sources=['phash.pyx', os.path.abspath('../../src/pHash.cpp')],
+        libraries=['png', 'tiff'],
+        language='c++')
+
+setup(
+    ext_modules=cythonize([phash_ext], language_level='3'))