diff --git a/bindings/python/README.md b/bindings/python/README.md
new file mode 100644
index 0000000..e526660
--- /dev/null
+++ b/bindings/python/README.md
@@ -0,0 +1,20 @@
+# Python binding to use pHash on images
+
+## Local use
+
+Check that all dependencies are available and create the header pHash.h:
+
+```
+mkdir build-phash
+cd build-phash
+cmake ../../../
+cd ..
+```
+
+Build the Python extension in the current directory:
+
+```
+python setup.py build_ext --inplace
+```
+
+
diff --git a/bindings/python/phash.pxd b/bindings/python/phash.pxd
new file mode 100644
index 0000000..8f99ce3
--- /dev/null
+++ b/bindings/python/phash.pxd
@@ -0,0 +1,11 @@
+# distutils: language=c++
+# Declarations of the pHash functions that phash.pyx calls.
+from libc.stdint cimport uint8_t, uint64_t
+
+cdef extern from "../../src/pHash.h":
+    # phash.pyx releases the buffer returned here with free().
+    uint8_t* ph_mh_imagehash(const char *, int&, float, float)
+    double ph_hammingdistance2(uint8_t*, int, uint8_t*, int)
+    # phash.pyx treats a return value of 0 as success.
+    int ph_dct_imagehash(const char *path, uint64_t&)
+    int ph_hamming_distance(uint64_t hash1, uint64_t hash2)
diff --git a/bindings/python/phash.pyx b/bindings/python/phash.pyx
new file mode 100644
index 0000000..8796951
--- /dev/null
+++ b/bindings/python/phash.pyx
@@ -0,0 +1,83 @@
+# distutils: language=c++
+import os
+from libc.stdlib cimport free
+from libc.stdint cimport uint8_t, uint64_t
+
+from phash cimport (ph_mh_imagehash, ph_hammingdistance2,
+                    ph_dct_imagehash, ph_hamming_distance)
+
+
+cdef class MHImageHash:
+    """Image hash computed by ph_mh_imagehash, stored as bytes."""
+    cdef bytes c_hash
+
+    def __getstate__(self):
+        """Return the raw hash bytes as the pickle state."""
+        return self.c_hash
+
+    def __setstate__(self, state):
+        """Restore the hash bytes from the pickle state."""
+        self.c_hash = state
+
+    def hamming_distance(self, MHImageHash other not None):
+        """Return 100 * ph_hammingdistance2() of both hashes, rounded to int.
+
+        ``other not None`` makes Cython raise TypeError for None instead of
+        accessing C attributes on None, which is unchecked.
+        """
+        return int(round(100 * ph_hammingdistance2(self.c_hash, len(self.c_hash), other.c_hash, len(other.c_hash))))
+
+    @staticmethod
+    def from_path(path, alpha=2.0, level=1.0):
+        """Hash the image at ``path``.
+
+        ``alpha`` and ``level`` are passed through to ph_mh_imagehash.
+        Raises RuntimeError when ph_mh_imagehash returns NULL.
+        """
+        cdef MHImageHash obj
+        cdef uint8_t* c_hash
+        cdef int c_hashlen = 0
+        c_hash = ph_mh_imagehash(os.fsencode(path), c_hashlen, alpha, level)
+        if c_hash is NULL:
+            raise RuntimeError("ph_mh_imagehash failed")
+        obj = MHImageHash()
+        try:
+            # Copy the C buffer into a Python bytes object; the finally
+            # clause frees the buffer even if the copy raises.
+            obj.c_hash = c_hash[:c_hashlen]
+        finally:
+            free(c_hash)
+        return obj
+
+
+cdef class DCTImageHash:
+    """64-bit image hash computed by ph_dct_imagehash."""
+    cdef uint64_t c_hash
+
+    def __getstate__(self):
+        """Return the hash value as the pickle state."""
+        return self.c_hash
+
+    def __setstate__(self, state):
+        """Restore the hash value from the pickle state."""
+        self.c_hash = state
+
+    def hamming_distance(self, DCTImageHash other not None):
+        """Return ph_hamming_distance() of this hash and ``other``.
+
+        ``other not None`` makes Cython raise TypeError for None instead of
+        accessing C attributes on None, which is unchecked.
+        """
+        return ph_hamming_distance(self.c_hash, other.c_hash)
+
+    @staticmethod
+    def from_path(path):
+        """Hash the image at ``path``.
+
+        Raises RuntimeError when ph_dct_imagehash returns a non-zero value.
+        """
+        cdef DCTImageHash obj = DCTImageHash()
+        if ph_dct_imagehash(os.fsencode(path), obj.c_hash) != 0:
+            raise RuntimeError("ph_dct_imagehash failed")
+        return obj
+
diff --git a/bindings/python/phash_add_directory.py b/bindings/python/phash_add_directory.py
new file mode 100644
index 0000000..724d200
--- /dev/null
+++ b/bindings/python/phash_add_directory.py
@@ -0,0 +1,69 @@
+"""
+Hash all images in the specified directory
+"""
+import os
+import pickle
+import argparse
+from pathlib import Path
+from multiprocessing import Pool, cpu_count
+from phash import DCTImageHash, MHImageHash
+
+
+def dct_hash_image(path):
+    """Print ``path`` as progress output and return its DCTImageHash."""
+    print(path, flush=True)
+    return DCTImageHash.from_path(Path(path))
+
+
+def mh_hash_image(path):
+    """Print ``path`` as progress output and return its MHImageHash."""
+    print(path, flush=True)
+    return MHImageHash.from_path(Path(path))
+
+
+def parse_args():
+    """Parse the command line: directory, cache file, extension, method."""
+    p = argparse.ArgumentParser()
+    p.add_argument('directory')
+    p.add_argument('cache_file', default='phash.cache', nargs='?')
+    p.add_argument('-e', '--extension', default='.jpg')
+    p.add_argument('-m', '--hash-method', default='dct', choices=('dct', 'mh'))
+    return p.parse_args()
+
+
+def main():
+    """Hash the matching files of a directory and merge them into the cache.
+
+    Files whose path is already stored in the cache are skipped, so running
+    the script again on the same directory does not add duplicate entries.
+    """
+    opts = parse_args()
+    if not os.path.isdir(opts.directory):
+        raise RuntimeError('directory %s does not exist' % opts.directory)
+    if opts.hash_method == 'dct':
+        hash_image = dct_hash_image
+    elif opts.hash_method == 'mh':
+        hash_image = mh_hash_image
+    else:
+        raise RuntimeError('unknown hash method %s' % opts.hash_method)
+    image_files_in, hashes_in = [], []
+    if os.path.exists(opts.cache_file):
+        # NOTE: unpickling can execute arbitrary code; only load cache files
+        # written by this script.
+        with open(opts.cache_file, 'rb') as fin:
+            image_files_in, hashes_in = pickle.load(fin)
+    known = set(image_files_in)
+    candidates = (os.path.join(opts.directory, name)
+                  for name in sorted(os.listdir(opts.directory))
+                  if name.endswith(opts.extension))
+    image_files = [path for path in candidates if path not in known]
+    with Pool(processes=cpu_count()) as pool:
+        hashes = pool.map(hash_image, image_files)
+    with open(opts.cache_file, 'wb') as fout:
+        pickle.dump((image_files + list(image_files_in),
+                     hashes + list(hashes_in)),
+                    fout, pickle.HIGHEST_PROTOCOL)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/bindings/python/phash_show_similar.py b/bindings/python/phash_show_similar.py
new file mode 100644
index 0000000..0ee2ebb
--- /dev/null
+++ b/bindings/python/phash_show_similar.py
@@ -0,0 +1,57 @@
+"""
+Print a list of similar pictures
+optionally delete it (maybe in a second pass)
+"""
+import os
+import sys
+import argparse
+import pickle
+
+
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument('cache_file')
+    p.add_argument('-t', '--threshold', type=int, default=10)
+    p.add_argument('-d', '--delete-threshold', type=float)
+    return p.parse_args()
+
+
+def main():
+    opts = parse_args()
+    with open(opts.cache_file, 'rb') as fin:
+        image_files, hashes = pickle.load(fin)
+    print('hashed', len(hashes), 'files with', hashes[0].__class__.__name__, file=sys.stderr)
+
+    print('
| ') + for i in range(opts.threshold): + print(' | %d | ' % i) + print('|
|---|---|---|
| [%03d] | ' % (image_files[i1], i1)) + for i in range(opts.threshold): + if i in similar: + print('') + for j in similar[i]: + print('[%03d]' % (image_files[j], j)) + print(' | ') + else: + print('') + print(' |