Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
e0b266b
remove 64-bit only constraint
giampaolo Mar 21, 2018
c7a0178
use specific magic ver for win
giampaolo Mar 21, 2018
cff5077
rename fun
giampaolo Mar 21, 2018
4487233
try upx=False
giampaolo Mar 22, 2018
fe26654
make flake8 happy
giampaolo Mar 22, 2018
cf2fcdc
remove lib magic on windows as 32 bit version is not supported
giampaolo Mar 22, 2018
84729c2
disable libmagic on windows; I just can't manage to make it work with…
giampaolo Mar 23, 2018
4e1960e
disable failing win test
giampaolo Mar 23, 2018
32e21d9
add mime logic in a new module
giampaolo Mar 26, 2018
4b7600a
refactoring
giampaolo Mar 26, 2018
1e0de8b
add make test-by-name
giampaolo Mar 26, 2018
c165684
treat source files as text/plain mime
giampaolo Mar 26, 2018
f848a90
refactoring
giampaolo Mar 26, 2018
8f306f5
add comment
giampaolo Mar 26, 2018
cd181b9
add magic wrapper module
giampaolo Mar 26, 2018
a558323
add from_buffer implementation
giampaolo Mar 26, 2018
12e9ebd
catch errors around puremagic
giampaolo Mar 26, 2018
2b8ab29
expose from_buffer()
giampaolo Mar 26, 2018
77da4a0
remove libmagic related files
giampaolo Mar 27, 2018
e527673
fix linux tests
giampaolo Mar 27, 2018
97939ee
fix test
giampaolo Mar 27, 2018
6426956
MRO refactoring
giampaolo Mar 27, 2018
d5bf2db
MRO refactoring
giampaolo Mar 27, 2018
5369e46
MRO refactoring
giampaolo Mar 27, 2018
e505943
write content tests
giampaolo Mar 27, 2018
dd99067
write content tests
giampaolo Mar 27, 2018
57229c3
lint
giampaolo Mar 27, 2018
c9da843
fix test
giampaolo Mar 27, 2018
752a76e
guess HTML
giampaolo Mar 27, 2018
aa9ab24
guess HTML type
giampaolo Mar 27, 2018
f327def
add comment
giampaolo Mar 27, 2018
19ae09e
remove dead code
giampaolo Mar 27, 2018
d29250e
update py ver on appveyor
giampaolo Mar 27, 2018
ee82c97
fix broken import
giampaolo Mar 27, 2018
f606957
handle empty file
giampaolo Mar 27, 2018
9d4ea6c
small refactoring
giampaolo Mar 27, 2018
8da5204
assume bin if file is empty
giampaolo Mar 27, 2018
836e381
fix travis failure
giampaolo Mar 27, 2018
8eb0b08
try to fix pyinstaller on appveyor
giampaolo Mar 27, 2018
4645898
Merge branch 'master' into puremagic
btimby Aug 24, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
PYTHON = python3
ARGS =

# If not in a virtualenv, add the --user option to install commands.
INSTALL_OPTS = `$(PYTHON) -c "import sys; print('' if hasattr(sys, 'real_prefix') else '--user')"`
Expand All @@ -21,9 +22,13 @@ SYSDEPS = \
TEST_PREFIX = PYTHONWARNINGS=all FULLTEXT_TESTING=1

test: ## Run tests.
${MAKE} install-git-hooks
${MAKE} install
$(TEST_PREFIX) $(PYTHON) fulltext/test/__init__.py

test-by-name: ## Run a specific test by name.
${MAKE} install
$(TEST_PREFIX) $(PYTHON) -m unittest -v $(ARGS)

ci: ## Run CI tests.
${MAKE} sysdeps
${MAKE} pydeps
Expand Down
10 changes: 3 additions & 7 deletions appveyor.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,9 @@ build: false

environment:
matrix:
- PYTHON: "C:\\Python35-x64"
PYTHON_VERSION: "3.5.x"
PYTHON_ARCH: "64"
ARCH: x86_64
VS_VER: "2015"
INSTANCENAME: "SQL2012SP1"

- PYTHON: "C:\\Python36"
PYTHON_VERSION: "3.6.x"
PYTHON_ARCH: "32"
init:
- "ECHO %PYTHON% %PYTHON_VERSION% %PYTHON_ARCH%"

Expand Down
288 changes: 16 additions & 272 deletions fulltext/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,21 @@
import re
import logging
import os
import mimetypes
import sys

from os.path import splitext

from six import string_types
from six import PY3
from fulltext.util import warn
from fulltext.util import magic
from fulltext.util import is_file_path
import fulltext.mimewrap
from fulltext.magicwrap import magic
from fulltext.mimewrap import ext_to_mimetype
from fulltext.mimewrap import mimetype_to_backend
from fulltext.util import fobj_to_tempfile
from fulltext.util import is_file_path
from fulltext.util import is_windows
from fulltext.util import warn


__all__ = ["get", "register_backend"]

Expand All @@ -32,77 +35,8 @@
LOGGER.addHandler(logging.NullHandler())
STRIP_WHITE = re.compile(r'[ \t\v\f\r\n]+')
SENTINAL = object()
MIMETYPE_TO_BACKENDS = {}
EXTS_TO_MIMETYPES = {}
MAGIC_BUFFER_SIZE = 1024

# Load the mimetypes database, then build the reverse lookup
# (mimetype -> extension) from it. When several extensions share one
# mimetype, the one iterated last in types_map is the one kept.
mimetypes.init()
_MIMETYPES_TO_EXT = {
    mimetype: extension
    for extension, mimetype in mimetypes.types_map.items()
}

# File extensions whose content is handled as plain text.
# Entries here take precedence over register_backend().
# Source of the list and of the per-extension descriptions:
# https://www.openoffice.org/dev_docs/source/file_extensions.html
# NOTE(review): ".class" and ".jar" are normally binary formats; confirm
# that treating them as plain text is intended.
_TEXT_EXTS = {
    ".asm",     # assembler source (non-UNIX)
    ".asp",     # Active Server Page
    ".awk",     # awk script
    ".bat",     # MS-DOS batch file
    ".c",       # C source
    ".class",   # compiled Java source code file
    ".cmd",     # compiler command file
    ".cpp",     # C++ source
    ".cxx",     # C++ source
    ".def",     # Win32 library definition file
    ".dpc",     # source dependency file (list of dependencies)
    ".dpj",     # Java source dependency file (list of dependencies)
    ".h",       # C header
    ".hpp",     # generated C++ header / "header plus plus" file
    ".hrc",     # an ".src" include header file
    ".hxx",     # C++ header
    ".in",
    ".inc",     # include file
    ".ini",     # initialization file
    ".inl",     # inline header file
    ".jar",     # Java classes archive file
    ".java",    # Java source
    ".js",      # JavaScript source
    ".jsp",     # Java Server Page
    ".kdelnk",  # KDE1 configuration file
    ".l",       # Lex source
    ".ll",      # Lex source
    ".lnx",     # Linux-specific makefile
    ".log",     # log file
    ".lst",     # ASCII database file used in solenv
    ".MacOS",
    ".md",      # Markdown
    ".mk",      # dmake makefile
    ".mod",     # BASIC module file
    ".par",     # script particles file
    ".pl",      # Perl script
    ".plc",     # former build script file, now obsolete
    ".pld",     # former build script file, now obsolete
    ".pm",      # Perl module
    ".pmk",     # project makefile
    ".pre",     # preprocessor output from scpcomp
    ".py",      # Python
    ".pyx",     # Cython
    ".r",       # resource file for Macintosh
    ".rc",      # dmake recursive makefile or Win32 resource script
    ".rdb",     # interface and type description database (type library)
    ".res",     # resource file
    ".rst",     # reStructuredText
    ".s",       # assembler source (UNIX)
    ".sbl",     # BASIC file
    ".scp",     # script source file
    ".sh",      # shell script
    ".src",     # source resource string file
    ".txt",     # language text file
    ".y",       # Yacc source
    ".yaml",    # YAML
    ".yml",     # YAML
    ".yxx",     # Bison source
}

register_backend = fulltext.mimewrap.register_backend

# XXX: dirty hack for pyinstaller so that it includes these modules.
# TODO: find a way to do this in pyinstaller.spec instead.
Expand Down Expand Up @@ -134,193 +68,6 @@
from fulltext.backends import __zip # NOQA


# =====================================================================
# --- backends
# =====================================================================


def register_backend(mimetype, module, extensions=None):
    """Map a mimetype, and the extensions that go with it, to a backend.

    `mimetype`: a mimetype string (e.g. 'text/plain').
    `module`: import string of the backend module
        (e.g. path.to.my.module).
    `extensions`: a list, tuple, set or frozenset of extensions, with
        or without the leading dot (e.g. ['txt', 'text']). When omitted,
        the extension is looked up in the table derived from the
        stdlib mimetypes module.

    Raises TypeError if `extensions` is some other type, and KeyError
    if `extensions` is omitted and no extension is known for `mimetype`.
    A mimetype that is already registered is overwritten after a
    warning is emitted.
    """
    if mimetype in MIMETYPE_TO_BACKENDS:
        warn("overwriting %r mimetype which was already set" % mimetype)
    # The backend is recorded before the extensions are validated, so it
    # stays registered even if one of the errors below is raised.
    MIMETYPE_TO_BACKENDS[mimetype] = module

    if extensions is not None:
        if not isinstance(extensions, (list, tuple, set, frozenset)):
            raise TypeError("invalid extensions type (got %r)" % extensions)
        # De-duplicate, then normalise every entry to the dotted form.
        for given in set(extensions):
            dotted = given if given.startswith('.') else '.' + given
            assert dotted, dotted
            EXTS_TO_MIMETYPES[dotted] = mimetype
        return

    # No explicit extensions: fall back to the mimetypes-derived table.
    try:
        guessed = _MIMETYPES_TO_EXT[mimetype]
    except KeyError:
        raise KeyError(
            "mimetypes module has no extension associated "
            "with %r mimetype; use 'extensions' arg yourself" % mimetype)
    assert guessed, guessed
    EXTS_TO_MIMETYPES[guessed] = mimetype


# Built-in backends as (mimetypes, backend module, extensions) rows.
# Rows are registered from top to bottom, one register_backend() call
# per mimetype. Order matters: an extension that appears in more than
# one row (e.g. '.xlsx') ends up mapped to the last mimetype registered
# for it.
_BUILTIN_BACKENDS = (
    (('application/zip',),
     'fulltext.backends.__zip',
     [".zip"]),
    (('application/x-rar-compressed',),
     'fulltext.backends.__rar',
     ['.rar']),
    (("text/xml", "application/xml", "application/x-xml"),
     'fulltext.backends.__xml',
     [".xml", ".xsd"]),
    (('application/vnd.ms-excel',),
     'fulltext.backends.__xlsx',
     ['.xls', '.xlsx']),
    (('application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',),
     'fulltext.backends.__xlsx',
     ['.xlsx']),
    (('text/plain',),
     'fulltext.backends.__text',
     ['.txt', '.text']),
    (('application/rtf',),
     'fulltext.backends.__rtf',
     ['.rtf']),
    (('application/vnd.openxmlformats-officedocument.presentationml.presentation',),  # NOQA
     'fulltext.backends.__pptx',
     ['.pptx']),
    (('application/pdf',),
     'fulltext.backends.__pdf',
     ['.pdf']),
    (('application/vnd.oasis.opendocument.text',),
     'fulltext.backends.__odt',
     ['.odt']),
    (('application/vnd.oasis.opendocument.spreadsheet',),
     'fulltext.backends.__odt',
     ['.ods']),
    # images
    (('image/jpeg',),
     'fulltext.backends.__ocr',
     ['.jpg', '.jpeg']),
    (('image/bmp',),
     'fulltext.backends.__ocr',
     ['.bmp']),
    (('image/png',),
     'fulltext.backends.__ocr',
     ['.png']),
    (('image/gif',),
     'fulltext.backends.__ocr',
     ['.gif']),
    (('application/x-hwp',),
     'fulltext.backends.__hwp',
     ['.hwp']),
    (('text/html', 'application/html', 'text/xhtml'),
     'fulltext.backends.__html',
     ['.htm', '.html', '.xhtml']),
    (('application/vnd.openxmlformats-officedocument.wordprocessingml.document',),  # NOQA
     'fulltext.backends.__docx',
     ['.docx']),
    (('application/msword',),
     'fulltext.backends.__doc',
     ['.doc']),
    (('text/csv', 'text/tsv', 'text/psv'),
     'fulltext.backends.__csv',
     ['.csv', '.tsv', '.psv', '.tab']),
    (("application/epub", "application/epub+zip"),
     'fulltext.backends.__epub',
     [".epub"]),
    (('application/postscript',),
     'fulltext.backends.__ps',
     [".ps", ".eps", ".ai"]),
    (('message/rfc822',),
     'fulltext.backends.__eml',
     ['.eml']),
    (('application/mbox',),
     'fulltext.backends.__mbox',
     ['.mbox']),
    (('application/vnd.ms-outlook',),
     'fulltext.backends.__msg',
     ['.msg']),
    (('application/gzip',),
     'fulltext.backends.__gz',
     ['.gz']),
    (('application/json',),
     'fulltext.backends.__json',
     ['.json']),
    # default backend.
    (('application/octet-stream',),
     'fulltext.backends.__bin',
     ['.a', '.bin']),
)

for _mimes, _backend, _exts in _BUILTIN_BACKENDS:
    for mt in _mimes:
        register_backend(mt, _backend, extensions=_exts)

# Plain-text extensions have no real mimetype of their own, so each one
# is registered under a made-up mime name routed to the text backend.
# This runs last, so it overrides any extension mapping set above.
for ext in _TEXT_EXTS:
    register_backend(
        '[custom-fulltext-mime]/%s' % ext,
        'fulltext.backends.__text',
        extensions=[ext])


# =====================================================================
# --- utils
# =====================================================================
Expand Down Expand Up @@ -428,7 +175,7 @@ def import_mod(mod_name):
def backend_from_mime(mime):
"""Determine backend module object from a mime string."""
try:
mod_name = MIMETYPE_TO_BACKENDS[mime]
mod_name = mimetype_to_backend(mime)

except KeyError:
msg = "No handler for %r, defaulting to %r" % (mime, DEFAULT_MIME)
Expand All @@ -437,7 +184,7 @@ def backend_from_mime(mime):
else:
LOGGER.debug(msg)

mod_name = MIMETYPE_TO_BACKENDS[DEFAULT_MIME]
mod_name = mimetype_to_backend(DEFAULT_MIME)
mod = import_mod(mod_name)
return mod

Expand All @@ -446,10 +193,8 @@ def backend_from_fname(name):
"""Determine backend module object from a file name."""
ext = splitext(name)[1]

try:
mime = EXTS_TO_MIMETYPES[ext]

except KeyError:
mime = ext_to_mimetype(ext)
if mime is None:
try:
f = open(name, 'rb')

Expand All @@ -466,14 +211,14 @@ def backend_from_fname(name):
else:
LOGGER.debug(msg)

mod_name = MIMETYPE_TO_BACKENDS[DEFAULT_MIME]
mod_name = mimetype_to_backend(DEFAULT_MIME)

else:
with f:
return backend_from_fobj(f)

else:
mod_name = MIMETYPE_TO_BACKENDS[mime]
mod_name = mimetype_to_backend(mime)

mod = import_mod(mod_name)
return mod
Expand Down Expand Up @@ -550,9 +295,8 @@ def _get(path_or_file, default, mime, name, backend, encoding,
backend_mod = backend_from_fobj(path_or_file)
else:
if isinstance(backend, string_types):
try:
mime = EXTS_TO_MIMETYPES['.' + backend]
except KeyError:
mime = ext_to_mimetype('.' + backend)
if mime is None:
raise ValueError("invalid backend %r" % backend)
backend_mod = backend_from_mime(mime)
else:
Expand Down
Binary file added fulltext/data/bin32/exiftool.exe
Binary file not shown.
Binary file removed fulltext/data/bin32/magic1.dll
Binary file not shown.
Empty file modified fulltext/data/bin32/pdftotext.exe
100644 → 100755
Empty file.
Binary file added fulltext/data/bin32/unrar.exe
Binary file not shown.
Binary file added fulltext/data/bin32/unrtf.exe
Binary file not shown.
Empty file modified fulltext/data/bin32/upx.exe
100644 → 100755
Empty file.
Binary file removed fulltext/data/bin64/libgnurx-0.dll
Binary file not shown.
Binary file removed fulltext/data/bin64/magic1.dll
Binary file not shown.
Empty file modified fulltext/data/bin64/pdftotext.exe
100644 → 100755
Empty file.
Empty file modified fulltext/data/bin64/unrar.exe
100644 → 100755
Empty file.
Loading