diff --git a/data/BUILD.bazel b/data/BUILD.bazel new file mode 100644 index 000000000..8a4df2dbe --- /dev/null +++ b/data/BUILD.bazel @@ -0,0 +1,17 @@ +package(default_visibility = ["//visibility:public"]) + +genrule( + name = "opencc_resources_zip", + srcs = [ + "//data/config:config", + "//data/dictionary:text_dictionaries", + ], + outs = ["opencc-resources.zip"], + cmd = ( + "$(location //data/scripts:opencc_resources_zip) " + + "--output $@ " + + "--configs $(locations //data/config:config) " + + "--dicts $(locations //data/dictionary:text_dictionaries)" + ), + tools = ["//data/scripts:opencc_resources_zip"], +) diff --git a/data/scripts/BUILD.bazel b/data/scripts/BUILD.bazel index e10388213..eb5e0efdf 100644 --- a/data/scripts/BUILD.bazel +++ b/data/scripts/BUILD.bazel @@ -21,6 +21,11 @@ py_binary( srcs = ["extract_tofu_risk.py"], ) +py_binary( + name = "opencc_resources_zip", + srcs = ["opencc_resources_zip.py"], +) + py_test( name = "common_test", size = "small", diff --git a/data/scripts/opencc_resources_zip.py b/data/scripts/opencc_resources_zip.py new file mode 100644 index 000000000..fe197239d --- /dev/null +++ b/data/scripts/opencc_resources_zip.py @@ -0,0 +1,79 @@ +#!/usr/bin/env python3 + +import argparse +import json +import os +import time +import zipfile + + +def parse_args(): + parser = argparse.ArgumentParser( + description="Build an OpenCC share archive backed by text dictionaries." 
+ ) + parser.add_argument("--output", required=True) + parser.add_argument("--configs", nargs="+", required=True) + parser.add_argument("--dicts", nargs="+", required=True) + return parser.parse_args() + + +def convert_dict_references(value): + if isinstance(value, dict): + converted = {} + for key, child in value.items(): + if key == "type" and child == "ocd2": + converted[key] = "text" + elif key == "file" and isinstance(child, str) and child.endswith(".ocd2"): + converted[key] = child[:-5] + ".txt" + else: + converted[key] = convert_dict_references(child) + return converted + if isinstance(value, list): + return [convert_dict_references(child) for child in value] + return value + + +def read_text_config(path): + with open(path, encoding="utf-8") as file: + config = json.load(file) + converted = convert_dict_references(config) + return json.dumps(converted, ensure_ascii=False, indent=2) + "\n" + + +def read_clean_dictionary(path): + cleaned = [] + with open(path, encoding="utf-8") as file: + for line in file: + if not line.strip() or line.lstrip().startswith("#"): + continue + cleaned.append(line) + return "".join(cleaned) + + +def write_entry(archive, name, content, date_time): + info = zipfile.ZipInfo(name) + info.date_time = date_time + info.compress_type = zipfile.ZIP_STORED + info.external_attr = 0o644 << 16 + archive.writestr(info, content.encode("utf-8")) + + +def main(): + args = parse_args() + output_dir = os.path.dirname(args.output) + if output_dir: + os.makedirs(output_dir, exist_ok=True) + date_time = time.gmtime(max(int(os.environ.get("SOURCE_DATE_EPOCH", "0")), 315532800))[:6]  # pinned (>= 1980-01-01 UTC, the ZIP minimum) instead of wall-clock time so the genrule output is reproducible + with zipfile.ZipFile(args.output, "w") as archive: + for path in sorted(args.configs, key=os.path.basename): + write_entry( + archive, os.path.basename(path), read_text_config(path), date_time + ) + for path in sorted(args.dicts, key=os.path.basename): + write_entry( + archive, os.path.basename(path), read_clean_dictionary(path), date_time + ) + + +if __name__ == "__main__": + main() diff --git a/node/cli.js b/node/cli.js index 
e3f4c6484..42595f221 100755 --- a/node/cli.js +++ b/node/cli.js @@ -87,6 +87,7 @@ function parseArgs(args) { config: 's2t.json', input: null, output: null, + resourceZip: null, includeTofuRiskDictionaries: false, help: false, version: false, @@ -103,6 +104,11 @@ function parseArgs(args) { i += 1; } else if (arg.startsWith('--config=')) { options.config = readInlineOptionValue(arg, '--config'); + } else if (arg === '--resource-zip') { + options.resourceZip = readOptionValue(args, i, arg); + i += 1; + } else if (arg.startsWith('--resource-zip=')) { + options.resourceZip = readInlineOptionValue(arg, '--resource-zip'); } else if (arg === '-i' || arg === '--input') { options.input = readOptionValue(args, i, arg); i += 1; @@ -224,6 +230,7 @@ function main() { try { const converter = new OpenCC(resolveConfigPath(options.config), { includeTofuRiskDictionaries: options.includeTofuRiskDictionaries, + resourceZip: options.resourceZip, }); convertStream(converter, options, (error) => { if (error) { diff --git a/node/opencc.cc b/node/opencc.cc index 9e2e52606..9ff9865e9 100644 --- a/node/opencc.cc +++ b/node/opencc.cc @@ -6,6 +6,7 @@ #include "src/Converter.hpp" #include "src/DictConverter.hpp" #include "src/Exception.hpp" +#include "src/ResourceProvider.hpp" using namespace opencc; @@ -57,6 +58,25 @@ class OpenccBinding : public Napi::ObjectWrap { : Napi::ObjectWrap(info), config_(), converter_() { Napi::Env env = info.Env(); + if (info.Length() >= 3 && info[0].IsString() && info[1].IsString() && + info[2].IsBoolean()) { + // Three-argument mode: + // NewFromFile(configFileName, ZipResourceProvider(resourceZipFileName)). 
+ const std::string configFile = ToUtf8String(info[0]); + const std::string resourceZipFile = ToUtf8String(info[1]); + ConfigLoadOptions options; + options.includeTofuRiskDictionaries = + info[2].As().Value(); + try { + std::shared_ptr provider( + new ZipResourceProvider(resourceZipFile)); + converter_ = config_.NewFromFile(configFile, provider, options); + } catch (opencc::Exception& e) { + Napi::Error::New(env, e.what()).ThrowAsJavaScriptException(); + } + return; + } + if (info.Length() >= 2 && info[0].IsString() && info[1].IsString()) { // Two-argument mode: NewFromString(jsonString, configDirectory) // Used by the JS layer to pass patched JSON with absolute paths. diff --git a/node/opencc.d.cts b/node/opencc.d.cts index a01499d78..a0e875ecf 100644 --- a/node/opencc.d.cts +++ b/node/opencc.d.cts @@ -1,5 +1,6 @@ interface OpenCCOptions { includeTofuRiskDictionaries?: boolean; + resourceZip?: string; } declare class OpenCC { diff --git a/node/opencc.d.mts b/node/opencc.d.mts index ceec363cc..17f4d5c7e 100644 --- a/node/opencc.d.mts +++ b/node/opencc.d.mts @@ -1,5 +1,6 @@ interface OpenCCOptions { includeTofuRiskDictionaries?: boolean; + resourceZip?: string; } declare class OpenCC { diff --git a/node/opencc.d.ts b/node/opencc.d.ts index 223a1020f..07a106e46 100644 --- a/node/opencc.d.ts +++ b/node/opencc.d.ts @@ -1,5 +1,6 @@ interface OpenCCOptions { includeTofuRiskDictionaries?: boolean; + resourceZip?: string; } declare class OpenCC { diff --git a/node/opencc.js b/node/opencc.js index 710545615..367b0e9ea 100644 --- a/node/opencc.js +++ b/node/opencc.js @@ -226,6 +226,15 @@ const OpenCC = module.exports = function (config, options) { const includeTofuRiskDictionaries = options.includeTofuRiskDictionaries !== false; + if (options.resourceZip) { + this.handler = new binding.Opencc( + config, + options.resourceZip, + includeTofuRiskDictionaries + ); + return; + } + // When opencc-jieba is installed, check if the requested config is a jieba // config. 
If so, load its JSON, patch all paths to absolute, and pass the // patched JSON string directly to the C++ layer via NewFromString. diff --git a/src/Config.cpp b/src/Config.cpp index 8db765da9..2cc988cf0 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -351,11 +351,46 @@ class ConfigInternal { throw FileNotFound(path); } + DictPtr LoadTextMarisaDictWithResourceProvider(const std::string& fileName) { + if (resourceProvider == nullptr) { + throw FileNotFound(fileName); + } + + const std::shared_ptr resource = + resourceProvider->GetResource(fileName); + std::string cacheKey = "text-marisa\n" + resource->CacheKey(); + { + std::lock_guard lock(DictCacheMutex()); + PruneExpiredDictCache(); + const auto cached = DictCache().find(cacheKey); + if (cached != DictCache().end()) { + DictPtr dict = cached->second.lock(); + if (dict != nullptr) { + return dict; + } + } + } + + TextDictPtr textDict = TextDict::NewFromBuffer(resource->Data(), + resource->Size()); + DictPtr dict = MarisaDict::NewFromDict(*textDict.get()); + { + std::lock_guard lock(DictCacheMutex()); + PruneExpiredDictCache(); + std::weak_ptr& cached = DictCache()[cacheKey]; + DictPtr cachedDict = cached.lock(); + if (cachedDict == nullptr) { + cached = dict; + return dict; + } + return cachedDict; + } + } + DictPtr LoadDictFromFile(const std::string& type, const std::string& fileName) { if (type == "text") { - DictPtr dict = LoadDictWithResourceProvider("text", fileName); - return MarisaDict::NewFromDict(*dict.get()); + return LoadTextMarisaDictWithResourceProvider(fileName); } #ifdef ENABLE_DARTS if (type == "ocd") { @@ -610,8 +645,14 @@ Config::NewFromFile(const std::string& fileName, std::string prefixedFileName; if (provider != nullptr) { try { - prefixedFileName = provider->Resolve(fileName); + const std::shared_ptr resource = + provider->GetResource(fileName); + impl->configDirectory = GetParentDirectory(resource->Name()); + return NewFromString(std::string(resource->Data(), resource->Size()), + 
provider, options); } catch (const FileNotFound&) { + // Some callers pass a provider for dictionaries only; keep normal config + // file lookup as a fallback when the provider cannot supply the config. prefixedFileName = impl->FindConfigFile(fileName); } } else { @@ -630,7 +671,12 @@ Config::NewFromFile(const std::string& fileName, if (slashPos != std::string::npos) { impl->configDirectory = prefixedFileName.substr(0, slashPos) + "/"; } - return NewFromString(content, provider, options); + std::shared_ptr effectiveProvider = provider; + if (effectiveProvider == nullptr) { + effectiveProvider = + NewFilesystemResourceProvider(impl->configDirectory, impl->paths); + } + return NewFromString(content, effectiveProvider, options); } ConverterPtr Config::NewFromFile(const std::string& fileName, diff --git a/src/ConfigTest.cpp b/src/ConfigTest.cpp index 0dab58ed3..054d49bd4 100644 --- a/src/ConfigTest.cpp +++ b/src/ConfigTest.cpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #if defined(_WIN32) || defined(_WIN64) @@ -41,6 +42,15 @@ namespace fs = std::filesystem; std::string PathString(const fs::path& path) { return path.u8string(); } +std::string NormalizePathString(std::string path) { + for (char& ch : path) { + if (ch == '\\') { + ch = '/'; + } + } + return path; +} + fs::path MakeTempDir(const std::string& name) { #if defined(_WIN32) || defined(_WIN64) const auto suffix = std::to_string(GetCurrentProcessId()); @@ -58,6 +68,85 @@ void WriteFile(const fs::path& path, const std::string& content) { ofs << content; } +void WriteLe16(std::ofstream& output, uint16_t value) { + output.put(static_cast(value & 0xff)); + output.put(static_cast((value >> 8) & 0xff)); +} + +void WriteLe32(std::ofstream& output, uint32_t value) { + output.put(static_cast(value & 0xff)); + output.put(static_cast((value >> 8) & 0xff)); + output.put(static_cast((value >> 16) & 0xff)); + output.put(static_cast((value >> 24) & 0xff)); +} + +struct ZipTestEntry { + std::string name; + 
std::string content; + uint32_t localHeaderOffset; +}; + +void WriteStoredZip(const fs::path& path, + std::vector> entries) { + std::ofstream output(path, std::ios::binary); + std::vector writtenEntries; + for (const auto& entry : entries) { + const std::string& name = entry.first; + const std::string& content = entry.second; + const uint32_t localHeaderOffset = + static_cast(output.tellp()); + WriteLe32(output, 0x04034b50); + WriteLe16(output, 20); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe32(output, 0); + WriteLe32(output, static_cast(content.size())); + WriteLe32(output, static_cast(content.size())); + WriteLe16(output, static_cast(name.size())); + WriteLe16(output, 0); + output.write(name.data(), static_cast(name.size())); + output.write(content.data(), static_cast(content.size())); + writtenEntries.push_back(ZipTestEntry{name, content, localHeaderOffset}); + } + + const uint32_t centralDirectoryOffset = + static_cast(output.tellp()); + for (const ZipTestEntry& entry : writtenEntries) { + WriteLe32(output, 0x02014b50); + WriteLe16(output, 20); + WriteLe16(output, 20); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe32(output, 0); + WriteLe32(output, static_cast(entry.content.size())); + WriteLe32(output, static_cast(entry.content.size())); + WriteLe16(output, static_cast(entry.name.size())); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe32(output, 0); + WriteLe32(output, entry.localHeaderOffset); + output.write(entry.name.data(), + static_cast(entry.name.size())); + } + const uint32_t centralDirectorySize = + static_cast(output.tellp()) - centralDirectoryOffset; + + WriteLe32(output, 0x06054b50); + WriteLe16(output, 0); + WriteLe16(output, 0); + WriteLe16(output, static_cast(writtenEntries.size())); + WriteLe16(output, static_cast(writtenEntries.size())); + WriteLe32(output, 
centralDirectorySize); + WriteLe32(output, centralDirectoryOffset); + WriteLe16(output, 0); +} + std::string SingleDictConfig(const std::string& dictFile) { return std::string("{\n" " \"name\": \"Resource Provider Test\",\n" @@ -230,6 +319,69 @@ TEST_F(ConfigTest, ExplicitProviderFindsConfigNameAndResources) { fs::remove_all(tempDir); } +TEST_F(ConfigTest, ZipProviderFindsConfigNameAndResources) { + const fs::path tempDir = MakeTempDir("opencc-zip-provider-test"); + const fs::path zipPath = tempDir / "resources.zip"; + WriteStoredZip(zipPath, { + {"config.json", SingleDictConfig("dict.txt")}, + {"dict.txt", utf8("鼠标\t滑鼠\n")}, + }); + + try { + std::shared_ptr provider( + new ZipResourceProvider(PathString(zipPath))); + const ConverterPtr tempConverter = + config.NewFromFile("config.json", provider); + EXPECT_EQ(utf8("滑鼠"), tempConverter->Convert(utf8("鼠标"))); + } catch (...) { + fs::remove_all(tempDir); + throw; + } + fs::remove_all(tempDir); +} + +TEST_F(ConfigTest, ZipProviderDoesNotOverrideAbsoluteConfigPath) { + const fs::path tempDir = MakeTempDir("opencc-zip-absolute-config-test"); + const fs::path zipPath = tempDir / "resources.zip"; + const fs::path configPath = tempDir / "config.json"; + WriteStoredZip(zipPath, { + {"config.json", + InlineSingleStepConfig( + "{\n" + " \"鼠标\": \"鼠标\"\n" + " }", + "{\n" + " \"type\": \"inline\",\n" + " \"entries\": {\n" + " \"鼠标\": \"乙\"\n" + " }\n" + " }")}, + }); + WriteFile(configPath, + InlineSingleStepConfig( + "{\n" + " \"鼠标\": \"鼠标\"\n" + " }", + "{\n" + " \"type\": \"inline\",\n" + " \"entries\": {\n" + " \"鼠标\": \"甲\"\n" + " }\n" + " }")); + + try { + std::shared_ptr provider( + new ZipResourceProvider(PathString(zipPath))); + const ConverterPtr tempConverter = + config.NewFromFile(PathString(configPath), provider); + EXPECT_EQ(utf8("甲"), tempConverter->Convert(utf8("鼠标"))); + } catch (...) 
{ + fs::remove_all(tempDir); + throw; + } + fs::remove_all(tempDir); +} + TEST_F(ConfigTest, ExplicitProviderConfigOverridesInstalledOrCwdConfigName) { const fs::path tempDir = MakeTempDir("opencc-provider-config-override-test"); const fs::path cwdDir = tempDir / "cwd"; @@ -326,6 +478,29 @@ TEST_F(ConfigTest, MissingResourceListsSearchedPaths) { fs::remove_all(tempDir); } +TEST_F(ConfigTest, FilesystemResourceCacheKeyIncludesFreshness) { + const fs::path tempDir = MakeTempDir("opencc-resource-cache-key-test"); + const fs::path resourceDir = tempDir / "resources"; + fs::create_directories(resourceDir); + const fs::path dictPath = resourceDir / "dict.txt"; + WriteFile(dictPath, utf8("鼠标\t滑鼠\n")); + + try { + FilesystemResourceProvider provider({PathString(resourceDir)}); + const std::shared_ptr resource = + provider.GetResource("dict.txt"); + EXPECT_EQ(NormalizePathString(PathString(dictPath)), + NormalizePathString(resource->Name())); + const std::string oldKey = + resource->Name() + "\n" + std::to_string(resource->Size()); + EXPECT_NE(oldKey, resource->CacheKey()); + } catch (...) 
{ + fs::remove_all(tempDir); + throw; + } + fs::remove_all(tempDir); +} + TEST_F(ConfigTest, PluginLikeResourcePathSupplementsMainPath) { const fs::path tempDir = MakeTempDir("opencc-plugin-resource-test"); const fs::path configDir = tempDir / "config"; diff --git a/src/Lexicon.cpp b/src/Lexicon.cpp index 9ac7e588f..f42987cfb 100644 --- a/src/Lexicon.cpp +++ b/src/Lexicon.cpp @@ -97,4 +97,38 @@ LexiconPtr Lexicon::ParseLexiconFromFile(FILE* fp) { return lexicon; } +LexiconPtr Lexicon::ParseLexiconFromBuffer(const char* data, size_t size) { + LexiconPtr lexicon(new Lexicon); + size_t offset = 0; + if (size >= 3 && static_cast(data[0]) == 0xef && + static_cast(data[1]) == 0xbb && + static_cast(data[2]) == 0xbf) { + offset = 3; + } + + size_t lineNum = 1; + while (offset < size) { + size_t lineEnd = offset; + while (lineEnd < size && data[lineEnd] != '\n') { + lineEnd++; + } + + std::string line(data + offset, lineEnd - offset); + if (!line.empty() && line.back() == '\r') { + line.pop_back(); + } + if (!line.empty() && line.front() != '#') { + line.push_back('\n'); + DictEntry* entry = ParseKeyValues(line.c_str(), lineNum); + if (entry != nullptr) { + lexicon->Add(entry); + } + } + + offset = lineEnd < size ? 
lineEnd + 1 : lineEnd; + lineNum++; + } + return lexicon; +} + } // namespace opencc diff --git a/src/Lexicon.hpp b/src/Lexicon.hpp index 61dcc59ed..8a07fd10a 100644 --- a/src/Lexicon.hpp +++ b/src/Lexicon.hpp @@ -63,6 +63,7 @@ class OPENCC_EXPORT Lexicon { } static LexiconPtr ParseLexiconFromFile(FILE* fp); + static LexiconPtr ParseLexiconFromBuffer(const char* data, size_t size); private: std::vector> entries; diff --git a/src/ResourceProvider.cpp b/src/ResourceProvider.cpp index bcc850e43..4118a448e 100644 --- a/src/ResourceProvider.cpp +++ b/src/ResourceProvider.cpp @@ -18,11 +18,19 @@ #include "ResourceProvider.hpp" +#include +#include +#include #include #include #if defined(_WIN32) || defined(_WIN64) +#include #include "WinUtil.hpp" +#else +#include +#include +#include #endif #include "Exception.hpp" @@ -32,6 +40,18 @@ namespace { bool IsSeparator(char ch) { return ch == '/' || ch == '\\'; } +uint16_t ReadLe16(const unsigned char* data) { + return static_cast(data[0]) | + (static_cast(data[1]) << 8); +} + +uint32_t ReadLe32(const unsigned char* data) { + return static_cast(data[0]) | + (static_cast(data[1]) << 8) | + (static_cast(data[2]) << 16) | + (static_cast(data[3]) << 24); +} + bool IsAbsolutePath(const std::string& path) { if (path.empty()) { return false; @@ -71,8 +91,318 @@ std::string JoinPath(const std::string& root, const std::string& resource) { return root + "/" + resource; } +std::string NormalizeResourceName(std::string_view resourceName) { + std::string normalized(resourceName); + for (char& ch : normalized) { + if (ch == '\\') { + ch = '/'; + } + } + while (!normalized.empty() && normalized.front() == '/') { + normalized.erase(normalized.begin()); + } + return normalized; +} + +bool IsSafeZipResourceName(const std::string& name) { + if (name.empty() || IsAbsolutePath(name)) { + return false; + } + size_t start = 0; + for (;;) { + const size_t pos = name.find('/', start); + const std::string part = name.substr(start, pos - start); + if 
(part.empty() || part == "." || part == "..") { + return false; + } + if (pos == std::string::npos) { + return true; + } + start = pos + 1; + } +} + +std::string BaseName(const std::string& path) { + const size_t pos = path.find_last_of('/'); + if (pos == std::string::npos) { + return path; + } + return path.substr(pos + 1); +} + +bool GetFileFreshnessCacheKey(const std::string& path, std::string* cacheKey) { +#if defined(_WIN32) || defined(_WIN64) + WIN32_FILE_ATTRIBUTE_DATA fileInfo; + const std::wstring widePath = internal::WideFromUtf8(path); + if (widePath.empty() || + !GetFileAttributesExW(widePath.c_str(), GetFileExInfoStandard, + &fileInfo)) { + return false; + } +#else + struct stat statBuf; + if (stat(path.c_str(), &statBuf) != 0) { + return false; + } +#endif + *cacheKey = path; + cacheKey->push_back('\n'); +#if defined(_WIN32) || defined(_WIN64) + cacheKey->append( + std::to_string(static_cast( + fileInfo.ftLastWriteTime.dwHighDateTime))); + cacheKey->push_back('.'); + cacheKey->append( + std::to_string(static_cast( + fileInfo.ftLastWriteTime.dwLowDateTime))); + cacheKey->push_back('\n'); + cacheKey->append( + std::to_string(static_cast(fileInfo.nFileSizeHigh))); + cacheKey->push_back('.'); + cacheKey->append( + std::to_string(static_cast(fileInfo.nFileSizeLow))); +#else + cacheKey->append(std::to_string(static_cast(statBuf.st_mtime))); + cacheKey->push_back('.'); +#if defined(__APPLE__) && defined(__MACH__) + cacheKey->append( + std::to_string(static_cast(statBuf.st_mtimespec.tv_nsec))); +#elif defined(st_mtime_nsec) + cacheKey->append( + std::to_string(static_cast(statBuf.st_mtime_nsec))); +#else + cacheKey->append( + std::to_string(static_cast(statBuf.st_mtim.tv_nsec))); +#endif + cacheKey->push_back('\n'); + cacheKey->append(std::to_string(static_cast(statBuf.st_size))); +#endif + return true; +} + +#if defined(_WIN32) || defined(_WIN64) +std::vector ReadBinaryFile(const std::string& path) { + FILE* file = 
_wfopen(internal::WideFromUtf8(path).c_str(), L"rb"); + if (file == nullptr) { + throw FileNotFound(path); + } + if (fseek(file, 0, SEEK_END) != 0) { + fclose(file); + throw FileNotFound(path); + } + const long fileSize = ftell(file); + if (fileSize < 0) { + fclose(file); + throw FileNotFound(path); + } + if (fseek(file, 0, SEEK_SET) != 0) { + fclose(file); + throw FileNotFound(path); + } + std::vector data(static_cast(fileSize)); + if (!data.empty() && + fread(data.data(), 1, data.size(), file) != data.size()) { + fclose(file); + throw FileNotFound(path); + } + fclose(file); + return data; +} +#endif + +struct ZipEntry { + uint16_t method; + uint32_t compressedSize; + uint32_t uncompressedSize; + uint32_t dataOffset; +}; + +class MappedZipArchive { +public: + explicit MappedZipArchive(const std::string& path) : fileName(path) { + if (!GetFileFreshnessCacheKey(path, &cacheKey)) { + throw FileNotFound(path); + } +#if defined(_WIN32) || defined(_WIN64) + buffer = ReadBinaryFile(path); + data = buffer.empty() ? 
nullptr : buffer.data(); + size = buffer.size(); +#else + fd = open(path.c_str(), O_RDONLY); + if (fd < 0) { + throw FileNotFound(path); + } + struct stat info; + if (fstat(fd, &info) != 0 || info.st_size < 0) { + close(fd); + fd = -1; + throw FileNotFound(path); + } + size = static_cast(info.st_size); + if (size == 0) { + close(fd); + fd = -1; + data = nullptr; + return; + } + void* mapped = mmap(nullptr, size, PROT_READ, MAP_PRIVATE, fd, 0); + if (mapped == MAP_FAILED) { + close(fd); + fd = -1; + throw FileNotFound(path); + } + data = static_cast(mapped); +#endif + } + + MappedZipArchive(const MappedZipArchive&) = delete; + MappedZipArchive& operator=(const MappedZipArchive&) = delete; + + ~MappedZipArchive() { +#if !defined(_WIN32) && !defined(_WIN64) + if (data != nullptr) { + munmap(const_cast(data), size); + } + if (fd >= 0) { + close(fd); + } +#endif + } + + const std::string fileName; + std::string cacheKey; + const unsigned char* data = nullptr; + size_t size = 0; + +private: +#if defined(_WIN32) || defined(_WIN64) + std::vector buffer; +#else + int fd = -1; +#endif +}; + } // namespace +struct ZipResourceProvider::Internal { + explicit Internal(const std::string& zipFileName_) + : archive(std::make_shared(zipFileName_)) { + Index(); + } + + void Index(); + + std::shared_ptr archive; + std::unordered_map entries; +}; + +void ZipResourceProvider::Internal::Index() { + const unsigned char* data = archive->data; + const size_t size = archive->size; + if (size < 22) { + throw InvalidFormat("Invalid zip archive: " + archive->fileName); + } + + const size_t maxCommentSize = 65535; + const size_t searchStart = + size > maxCommentSize + 22 ? 
size - maxCommentSize - 22 : 0; + size_t eocdOffset = std::string::npos; + for (size_t pos = size - 22;; pos--) { + if (data[pos] == 0x50 && data[pos + 1] == 0x4b && data[pos + 2] == 0x05 && + data[pos + 3] == 0x06) { + eocdOffset = pos; + break; + } + if (pos == searchStart) { + break; + } + } + if (eocdOffset == std::string::npos) { + throw InvalidFormat("Invalid zip archive: " + archive->fileName); + } + + const uint16_t entryCount = ReadLe16(&data[eocdOffset + 10]); + const uint32_t centralDirectorySize = ReadLe32(&data[eocdOffset + 12]); + const uint32_t centralDirectoryOffset = ReadLe32(&data[eocdOffset + 16]); + if (centralDirectoryOffset > size || + centralDirectorySize > size - centralDirectoryOffset) { + throw InvalidFormat("Invalid zip central directory: " + archive->fileName); + } + + size_t pos = centralDirectoryOffset; + for (uint16_t i = 0; i < entryCount; i++) { + if (pos + 46 > size || ReadLe32(&data[pos]) != 0x02014b50) { + throw InvalidFormat("Invalid zip central directory: " + archive->fileName); + } + const uint16_t method = ReadLe16(&data[pos + 10]); + const uint32_t compressedSize = ReadLe32(&data[pos + 20]); + const uint32_t uncompressedSize = ReadLe32(&data[pos + 24]); + const uint16_t fileNameLength = ReadLe16(&data[pos + 28]); + const uint16_t extraLength = ReadLe16(&data[pos + 30]); + const uint16_t commentLength = ReadLe16(&data[pos + 32]); + const uint32_t localHeaderOffset = ReadLe32(&data[pos + 42]); + const size_t next = pos + 46 + fileNameLength + extraLength + commentLength; + if (next > size) { + throw InvalidFormat("Invalid zip central directory: " + archive->fileName); + } + const std::string name( + reinterpret_cast(&data[pos + 46]), fileNameLength); + const std::string normalized = NormalizeResourceName(name); + if (!IsAbsolutePath(name) && !normalized.empty() && + normalized.back() != '/' && IsSafeZipResourceName(normalized)) { + if (size < 30 || localHeaderOffset > size - 30 || // compare on the size_t side: a 32-bit "offset + 30" wraps for crafted offsets and would let the read below go out of bounds + ReadLe32(&data[localHeaderOffset]) != 0x04034b50) { + 
throw InvalidFormat("Invalid zip local header: " + archive->fileName); + } + const uint16_t localNameLength = ReadLe16(&data[localHeaderOffset + 26]); + const uint16_t localExtraLength = ReadLe16(&data[localHeaderOffset + 28]); + const size_t dataOffset = + localHeaderOffset + 30 + localNameLength + localExtraLength; + if (dataOffset > size || compressedSize > size - dataOffset) { + throw InvalidFormat("Invalid zip entry data: " + archive->fileName); + } + entries[normalized] = ZipEntry{ + method, compressedSize, uncompressedSize, + static_cast(dataOffset)}; + } + pos = next; + } +} + +ResourceProvider::Resource::Resource(std::string name_, const char* data_, + size_t size_, + std::shared_ptr owner_, + std::string cacheKey_) + : name(std::move(name_)), data(data_), size(size_), + owner(std::move(owner_)), cacheKey(std::move(cacheKey_)) {} + +std::shared_ptr +ResourceProvider::GetResource(std::string_view resourceName) const { + const std::string path = Resolve(resourceName); + std::shared_ptr content(new std::string); +#if defined(_WIN32) || defined(_WIN64) + const std::vector data = ReadBinaryFile(path); + content->assign(reinterpret_cast(data.data()), data.size()); +#else + std::ifstream input(path, std::ios::binary); + if (!input) { + throw FileNotFound(path); + } + content->assign(std::istreambuf_iterator(input), + std::istreambuf_iterator()); + if (input.bad()) { + throw FileNotFound(path); + } +#endif + + std::string cacheKey; + if (!GetFileFreshnessCacheKey(path, &cacheKey)) { + throw FileNotFound(path); + } + return std::make_shared(path, content->data(), content->size(), + content, cacheKey); +} + FilesystemResourceProvider::FilesystemResourceProvider( std::vector searchPaths_) : searchPaths(std::move(searchPaths_)) {} @@ -108,4 +438,53 @@ FilesystemResourceProvider::Resolve(std::string_view resourceName) const { throw FileNotFound(resourcePath + " (searched: " + searched.str() + ")"); } +ZipResourceProvider::ZipResourceProvider(std::string zipFileName) + 
: internal(new Internal(zipFileName)) {} + +ZipResourceProvider::~ZipResourceProvider() = default; + +std::string ZipResourceProvider::Resolve(std::string_view resourceName) const { + throw FileNotFound(std::string(resourceName)); +} + +std::shared_ptr +ZipResourceProvider::GetResource(std::string_view resourceName) const { + if (IsAbsolutePath(std::string(resourceName))) { + throw FileNotFound(std::string(resourceName)); + } + + const std::string normalized = NormalizeResourceName(resourceName); + if (!IsSafeZipResourceName(normalized)) { + throw FileNotFound(std::string(resourceName)); + } + + auto entry = internal->entries.find(normalized); + if (entry == internal->entries.end()) { + const std::string baseName = BaseName(normalized); + if (baseName != normalized) { + entry = internal->entries.find(baseName); + } + } + if (entry == internal->entries.end()) { + throw FileNotFound(normalized); + } + if (entry->second.method != 0) { + throw InvalidFormat("Unsupported zip compression method for " + entry->first); + } + if (entry->second.compressedSize != entry->second.uncompressedSize) { + throw InvalidFormat("Invalid stored zip entry size for " + entry->first); + } + + const char* data = reinterpret_cast( + internal->archive->data + entry->second.dataOffset); + std::string cacheKey = internal->archive->cacheKey; + cacheKey.push_back('\n'); + cacheKey.append(entry->first); + cacheKey.push_back('\n'); + cacheKey.append(std::to_string(entry->second.uncompressedSize)); + return std::make_shared( + entry->first, data, entry->second.uncompressedSize, internal->archive, + cacheKey); +} + } // namespace opencc diff --git a/src/ResourceProvider.hpp b/src/ResourceProvider.hpp index 0d38174cb..724f60b46 100644 --- a/src/ResourceProvider.hpp +++ b/src/ResourceProvider.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include "Export.hpp" @@ -28,9 +29,30 @@ namespace opencc { class OPENCC_EXPORT ResourceProvider { public: + class OPENCC_EXPORT Resource { + public: + 
Resource(std::string name_, const char* data_, size_t size_, + std::shared_ptr owner_, std::string cacheKey_); + + const std::string& Name() const { return name; } + const char* Data() const { return data; } + size_t Size() const { return size; } + const std::string& CacheKey() const { return cacheKey; } + + private: + std::string name; + const char* data; + size_t size; + std::shared_ptr owner; + std::string cacheKey; + }; + virtual ~ResourceProvider() = default; virtual std::string Resolve(std::string_view resourceName) const = 0; + + virtual std::shared_ptr + GetResource(std::string_view resourceName) const; }; class OPENCC_EXPORT FilesystemResourceProvider : public ResourceProvider { @@ -43,4 +65,22 @@ class OPENCC_EXPORT FilesystemResourceProvider : public ResourceProvider { std::vector searchPaths; }; +class OPENCC_EXPORT ZipResourceProvider : public ResourceProvider { +public: + explicit ZipResourceProvider(std::string zipFileName); + ~ZipResourceProvider(); + + ZipResourceProvider(const ZipResourceProvider&) = delete; + ZipResourceProvider& operator=(const ZipResourceProvider&) = delete; + + std::string Resolve(std::string_view resourceName) const override; + + std::shared_ptr + GetResource(std::string_view resourceName) const override; + +private: + struct Internal; + std::unique_ptr internal; +}; + } // namespace opencc diff --git a/src/TextDict.cpp b/src/TextDict.cpp index 53108dab7..71b220e9e 100644 --- a/src/TextDict.cpp +++ b/src/TextDict.cpp @@ -57,6 +57,17 @@ TextDictPtr TextDict::NewFromFile(FILE* fp) { return TextDictPtr(new TextDict(lexicon)); } +TextDictPtr TextDict::NewFromBuffer(const char* data, size_t size) { + const LexiconPtr& lexicon = Lexicon::ParseLexiconFromBuffer(data, size); + lexicon->Sort(); + std::string dupkey; + if (!lexicon->IsUnique(&dupkey)) { + throw InvalidFormat( + "The text dictionary contains duplicated keys: " + dupkey + "."); + } + return TextDictPtr(new TextDict(lexicon)); +} + TextDictPtr TextDict::NewFromDict(const 
Dict& dict) { return TextDictPtr(new TextDict(dict.GetLexicon())); } diff --git a/src/TextDict.hpp b/src/TextDict.hpp index f1cb67d92..46c6acf54 100644 --- a/src/TextDict.hpp +++ b/src/TextDict.hpp @@ -51,6 +51,8 @@ class OPENCC_EXPORT TextDict : public Dict, public SerializableDict { static TextDictPtr NewFromFile(FILE* fp); + static TextDictPtr NewFromBuffer(const char* data, size_t size); + static TextDictPtr NewFromSortedFile(FILE* fp); private: diff --git a/src/tools/CommandLineMain.cpp b/src/tools/CommandLineMain.cpp index e14733c43..2b6d478cd 100644 --- a/src/tools/CommandLineMain.cpp +++ b/src/tools/CommandLineMain.cpp @@ -37,6 +37,7 @@ #include "src/ConversionInspection.hpp" #include "src/Converter.hpp" #include "src/Exception.hpp" +#include "src/ResourceProvider.hpp" #include "src/Segments.hpp" #include "src/tools/CommandLineMain.hpp" #include "src/tools/PlatformIO.hpp" @@ -550,7 +551,40 @@ int CommandLineMain(std::vector args) { "default, the command line tool skips these dictionaries.", cmd, false); const std::string argv0String = args.empty() ? std::string() : args[0]; - cmd.parse(args); + Optional resourceZipFileName = + Optional::Null(); + std::vector visibleArgs; + if (!args.empty()) { + visibleArgs.push_back(args[0]); + } + for (size_t i = 1; i < args.size(); i++) { + const std::string& arg = args[i]; + std::string value; + if (arg == "--resource-zip") { + if (i + 1 >= args.size() || args[i + 1].empty() || + args[i + 1][0] == '-') { + std::cerr << "error: Missing value for " << arg << std::endl; + return 1; + } + value = args[++i]; + } else if (arg.rfind("--resource-zip=", 0) == 0) { + value = arg.substr(std::string("--resource-zip=").size()); + } else { + visibleArgs.push_back(arg); + continue; + } + if (value.empty()) { + std::cerr << "error: Missing value for " << arg << std::endl; + return 1; + } + if (!resourceZipFileName.IsNull()) { + std::cerr << "error: resource zip specified more than once." 
+ << std::endl; + return 1; + } + resourceZipFileName = Optional(value); + } + cmd.parse(visibleArgs); // Validate mutual exclusion and dependencies if (segmentationArg.getValue() && inspectArg.getValue()) { @@ -586,9 +620,15 @@ int CommandLineMain(std::vector args) { ConfigLoadOptions configOptions; configOptions.includeTofuRiskDictionaries = includeTofuRiskDictionariesArg.getValue(); - converter = - config.NewFromFile(configFileName, pathArg.getValue(), argv0, - configOptions); + if (!resourceZipFileName.IsNull()) { + std::shared_ptr provider( + new ZipResourceProvider(resourceZipFileName.Get())); + converter = config.NewFromFile(configFileName, provider, configOptions); + } else { + converter = + config.NewFromFile(configFileName, pathArg.getValue(), argv0, + configOptions); + } measurement.loadMs += DurationToMilliseconds(std::chrono::steady_clock::now() - loadStart); bool lineByLine = inputFileName.IsNull(); diff --git a/test/BUILD.bazel b/test/BUILD.bazel index befdb101d..e5ebbc5a5 100644 --- a/test/BUILD.bazel +++ b/test/BUILD.bazel @@ -36,6 +36,7 @@ cc_test( ], data = [ "//data/config", + "//data:opencc_resources_zip", "//data/dictionary:binary_dictionaries", "//data/dictionary:text_dictionaries", "//src/tools:command_line", diff --git a/test/CommandLineConvertTest.cpp b/test/CommandLineConvertTest.cpp index dfc454977..e31a1d526 100644 --- a/test/CommandLineConvertTest.cpp +++ b/test/CommandLineConvertTest.cpp @@ -144,6 +144,12 @@ class CommandLineConvertTest : public ::testing::Test { #endif } +#ifdef BAZEL + std::string ResourceZipFile() const { + return runfiles_->Rlocation("_main/data/opencc-resources.zip"); + } +#endif + std::string ConfigurationDirectory() const { #ifdef BAZEL return ""; @@ -206,6 +212,22 @@ class CommandLineConvertTest : public ::testing::Test { #endif } +#ifdef BAZEL + std::string TestResourceZipCommand(const std::string& config, + const std::string& inputFile, + const std::string& outputFile) const { + std::string cmd = 
QuotePath(OpenccCommand()) + " -i " + + QuotePath(inputFile) + " -o " + QuotePath(outputFile) + + " -c " + QuotePath(config + ".json") + + " --resource-zip " + QuotePath(ResourceZipFile()); +#ifdef _WIN32 + return "\"" + cmd + "\""; +#else + return cmd; +#endif + } +#endif + std::string TestCommandWithFlags(const std::string& config, const std::string& inputFile, const std::string& outputFile, @@ -397,6 +419,23 @@ TEST_F(CommandLineConvertTest, IncludeTofuRiskDictionariesFlagRestoresLegacy) { EXPECT_EQ("𫝈", GetFileContents(outputFile)); } +#ifdef BAZEL +TEST_F(CommandLineConvertTest, ResourceZipConvertsWithoutResourcePaths) { + const std::string inputFile = InputFile("resource_zip"); + const std::string outputFile = OutputFile("resource_zip"); + + { + std::ofstream ofs(inputFile, std::ios::binary); + ASSERT_TRUE(ofs.is_open()); + ofs << "打印机和鼠标"; + } + + ASSERT_EQ(0, + RunCommand(TestResourceZipCommand("s2twp", inputFile, outputFile))); + EXPECT_EQ("印表機和滑鼠", GetFileContents(outputFile)); +} +#endif + TEST_F(CommandLineConvertTest, StdinPreservesLineEndingsAndUnknownCharacters) { const std::string config = "s2t"; const std::string inputFile = InputFile("stdin_line_endings"); diff --git a/test/testcases/testcases.json b/test/testcases/testcases.json index f905f1d1f..92b0e3e6c 100755 --- a/test/testcases/testcases.json +++ b/test/testcases/testcases.json @@ -530,6 +530,13 @@ "tw2sp": "对乙酰氨基酚和对乙酰氨基酚指的是同一种退烧止痛药" } }, + { + "id": "medical_phrases_tw_aids_love_taste_tw2sp", + "input": "愛滋病患也能嚐到戀愛滋味", + "expected": { + "tw2sp": "艾滋病人也能尝到恋爱滋味" + } + }, { "id": "medical_phrases_tw_hepatitis_s2twp", "input": "乙肝和丙肝患者要定期追蹤",