Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions data/BUILD.bazel
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
# Every target in this package is visible to all other packages.
package(default_visibility = ["//visibility:public"])

# Packs the OpenCC config files and the text dictionaries into a single
# archive, opencc-resources.zip, by running the
# //data/scripts:opencc_resources_zip tool.
genrule(
name = "opencc_resources_zip",
srcs = [
"//data/config:config",
"//data/dictionary:text_dictionaries",
],
outs = ["opencc-resources.zip"],
# $(locations ...) expands to a space-separated list of paths, which is what
# the tool's multi-value --configs / --dicts options consume; $@ is the
# single declared output.
cmd = (
"$(location //data/scripts:opencc_resources_zip) "
+ "--output $@ "
+ "--configs $(locations //data/config:config) "
+ "--dicts $(locations //data/dictionary:text_dictionaries)"
),
# Listed under tools because the script is executed during the build.
tools = ["//data/scripts:opencc_resources_zip"],
)
5 changes: 5 additions & 0 deletions data/scripts/BUILD.bazel
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@ py_binary(
srcs = ["extract_tofu_risk.py"],
)

# Command-line tool that writes OpenCC configs and text dictionaries into a zip
# archive; the //data:opencc_resources_zip genrule runs it as a build tool.
py_binary(
name = "opencc_resources_zip",
srcs = ["opencc_resources_zip.py"],
)

py_test(
name = "common_test",
size = "small",
Expand Down
79 changes: 79 additions & 0 deletions data/scripts/opencc_resources_zip.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
#!/usr/bin/env python3

import argparse
import json
import os
import time
import zipfile


def parse_args():
    """Parse the command line of the resource-archive builder.

    Returns:
        argparse.Namespace with ``output`` (a single string) plus ``configs``
        and ``dicts`` (lists of one or more strings). All three options are
        required; argparse exits with a usage error when one is missing.
    """
    cli = argparse.ArgumentParser(
        description="Build an OpenCC share archive backed by text dictionaries."
    )
    cli.add_argument("--output", required=True)
    # Both list-valued options share the same shape: at least one value each.
    for list_flag in ("--configs", "--dicts"):
        cli.add_argument(list_flag, nargs="+", required=True)
    return cli.parse_args()


def convert_dict_references(value):
    """Return a copy of ``value`` with ocd2 dictionary references made textual.

    The structure is walked recursively through dicts and lists. Inside every
    dict, a ``"type"`` entry equal to ``"ocd2"`` becomes ``"text"`` and a
    string ``"file"`` entry ending in ``.ocd2`` gets a ``.txt`` suffix instead.
    Anything else is carried over unchanged; the input is not mutated.
    """
    if isinstance(value, list):
        return [convert_dict_references(item) for item in value]
    if not isinstance(value, dict):
        # Scalars (strings, numbers, booleans, None) pass straight through.
        return value

    rewritten = {}
    for name, item in value.items():
        if name == "type" and item == "ocd2":
            rewritten[name] = "text"
            continue
        if name == "file" and isinstance(item, str) and item.endswith(".ocd2"):
            rewritten[name] = item[: -len(".ocd2")] + ".txt"
            continue
        rewritten[name] = convert_dict_references(item)
    return rewritten


def read_text_config(path):
    """Load the JSON config at ``path`` and return it re-serialized as text.

    The parsed config is passed through ``convert_dict_references`` and then
    dumped with two-space indentation, non-ASCII characters kept as-is, and a
    single trailing newline.
    """
    with open(path, encoding="utf-8") as handle:
        parsed = json.load(handle)
    rendered = json.dumps(
        convert_dict_references(parsed), ensure_ascii=False, indent=2
    )
    return rendered + "\n"


def read_clean_dictionary(path):
    """Return the text dictionary at ``path`` without blank and comment lines.

    A line is dropped when it is empty or whitespace-only, or when its first
    non-whitespace character is ``#``. Surviving lines are returned unmodified
    (including their line endings) and in their original order.
    """
    with open(path, encoding="utf-8") as handle:
        return "".join(
            entry
            for entry in handle
            if entry.strip() and not entry.lstrip().startswith("#")
        )


def write_entry(archive, name, content, date_time):
    """Add one uncompressed UTF-8 text member to an open zip archive.

    Args:
        archive: zipfile.ZipFile opened for writing.
        name: Member name inside the archive.
        content: Text payload; it is encoded as UTF-8 before being stored.
        date_time: (year, month, day, hour, minute, second) tuple recorded as
            the member's modification time.
    """
    unix_mode = 0o644
    entry = zipfile.ZipInfo(name)
    entry.date_time = date_time
    entry.compress_type = zipfile.ZIP_STORED
    # By zip convention the Unix permission bits live in the upper 16 bits of
    # the external attributes field.
    entry.external_attr = unix_mode << 16
    archive.writestr(entry, content.encode("utf-8"))


def _archive_date_time():
    """Return the fixed (Y, M, D, h, m, s) tuple stamped on every zip entry."""
    # 1980-01-01 is the earliest timestamp the zip format can represent.
    earliest = (1980, 1, 1, 0, 0, 0)
    epoch = os.environ.get("SOURCE_DATE_EPOCH")
    if not epoch:
        return earliest
    stamp = tuple(time.gmtime(int(epoch))[:6])
    return max(stamp, earliest)


def main():
    """Build the resource archive named by ``--output``.

    Every ``--configs`` file is stored under its base name after its ocd2
    dictionary references have been rewritten to text ones, followed by every
    ``--dicts`` file stored under its base name with blank and comment lines
    removed. Within each group, members are written in base-name order. The
    output's parent directory is created when it does not exist.

    Raises:
        ValueError: if ``SOURCE_DATE_EPOCH`` is set but is not an integer.
    """
    args = parse_args()
    output_dir = os.path.dirname(args.output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    # Stamping entries with the wall clock made every build produce a
    # different archive, which defeats build caching and reproducibility.
    # Use a constant timestamp instead (overridable via SOURCE_DATE_EPOCH).
    date_time = _archive_date_time()
    with zipfile.ZipFile(args.output, "w") as archive:
        for paths, render in (
            (args.configs, read_text_config),
            (args.dicts, read_clean_dictionary),
        ):
            for path in sorted(paths, key=os.path.basename):
                write_entry(
                    archive, os.path.basename(path), render(path), date_time
                )


if __name__ == "__main__":
main()
7 changes: 7 additions & 0 deletions node/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -87,6 +87,7 @@ function parseArgs(args) {
config: 's2t.json',
input: null,
output: null,
resourceZip: null,
includeTofuRiskDictionaries: false,
help: false,
version: false,
Expand All @@ -103,6 +104,11 @@ function parseArgs(args) {
i += 1;
} else if (arg.startsWith('--config=')) {
options.config = readInlineOptionValue(arg, '--config');
} else if (arg === '--resource-zip') {
options.resourceZip = readOptionValue(args, i, arg);
i += 1;
} else if (arg.startsWith('--resource-zip=')) {
options.resourceZip = readInlineOptionValue(arg, '--resource-zip');
} else if (arg === '-i' || arg === '--input') {
options.input = readOptionValue(args, i, arg);
i += 1;
Expand Down Expand Up @@ -224,6 +230,7 @@ function main() {
try {
const converter = new OpenCC(resolveConfigPath(options.config), {
includeTofuRiskDictionaries: options.includeTofuRiskDictionaries,
resourceZip: options.resourceZip,
});
convertStream(converter, options, (error) => {
if (error) {
Expand Down
20 changes: 20 additions & 0 deletions node/opencc.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "src/Converter.hpp"
#include "src/DictConverter.hpp"
#include "src/Exception.hpp"
#include "src/ResourceProvider.hpp"

using namespace opencc;

Expand Down Expand Up @@ -57,6 +58,25 @@ class OpenccBinding : public Napi::ObjectWrap<OpenccBinding> {
: Napi::ObjectWrap<OpenccBinding>(info), config_(), converter_() {
Napi::Env env = info.Env();

if (info.Length() >= 3 && info[0].IsString() && info[1].IsString() &&
info[2].IsBoolean()) {
// Three-argument mode:
// NewFromFile(configFileName, ZipResourceProvider(resourceZipFileName)).
const std::string configFile = ToUtf8String(info[0]);
const std::string resourceZipFile = ToUtf8String(info[1]);
ConfigLoadOptions options;
options.includeTofuRiskDictionaries =
info[2].As<Napi::Boolean>().Value();
try {
std::shared_ptr<ResourceProvider> provider(
new ZipResourceProvider(resourceZipFile));
converter_ = config_.NewFromFile(configFile, provider, options);
} catch (opencc::Exception& e) {
Napi::Error::New(env, e.what()).ThrowAsJavaScriptException();
}
return;
}

if (info.Length() >= 2 && info[0].IsString() && info[1].IsString()) {
// Two-argument mode: NewFromString(jsonString, configDirectory)
// Used by the JS layer to pass patched JSON with absolute paths.
Expand Down
1 change: 1 addition & 0 deletions node/opencc.d.cts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/** Options bag passed as the second argument when constructing `OpenCC`. */
interface OpenCCOptions {
/**
 * Whether the tofu-risk dictionaries are included when the config is loaded.
 * The JS wrapper treats every value other than `false` as enabled, so
 * omitting the option leaves it on.
 */
includeTofuRiskDictionaries?: boolean;
/**
 * Path to a resource zip archive. When set, the wrapper hands the config
 * name, this path and the tofu-risk flag straight to the native binding,
 * which loads the config through a zip-backed resource provider.
 */
resourceZip?: string;
}

declare class OpenCC {
Expand Down
1 change: 1 addition & 0 deletions node/opencc.d.mts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/** Options bag passed as the second argument when constructing `OpenCC`. */
interface OpenCCOptions {
/**
 * Whether the tofu-risk dictionaries are included when the config is loaded.
 * The JS wrapper treats every value other than `false` as enabled, so
 * omitting the option leaves it on.
 */
includeTofuRiskDictionaries?: boolean;
/**
 * Path to a resource zip archive. When set, the wrapper hands the config
 * name, this path and the tofu-risk flag straight to the native binding,
 * which loads the config through a zip-backed resource provider.
 */
resourceZip?: string;
}

declare class OpenCC {
Expand Down
1 change: 1 addition & 0 deletions node/opencc.d.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
/** Options bag passed as the second argument when constructing `OpenCC`. */
interface OpenCCOptions {
/**
 * Whether the tofu-risk dictionaries are included when the config is loaded.
 * The JS wrapper treats every value other than `false` as enabled, so
 * omitting the option leaves it on.
 */
includeTofuRiskDictionaries?: boolean;
/**
 * Path to a resource zip archive. When set, the wrapper hands the config
 * name, this path and the tofu-risk flag straight to the native binding,
 * which loads the config through a zip-backed resource provider.
 */
resourceZip?: string;
}

declare class OpenCC {
Expand Down
9 changes: 9 additions & 0 deletions node/opencc.js
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,15 @@ const OpenCC = module.exports = function (config, options) {
const includeTofuRiskDictionaries =
options.includeTofuRiskDictionaries !== false;

if (options.resourceZip) {
this.handler = new binding.Opencc(
config,
options.resourceZip,
includeTofuRiskDictionaries
);
return;
}

// When opencc-jieba is installed, check if the requested config is a jieba
// config. If so, load its JSON, patch all paths to absolute, and pass the
// patched JSON string directly to the C++ layer via NewFromString.
Expand Down
54 changes: 50 additions & 4 deletions src/Config.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -351,11 +351,46 @@ class ConfigInternal {
throw FileNotFound(path);
}

/// Loads a text dictionary through the resource provider and returns it as a
/// Marisa dictionary, sharing instances via the process-wide dictionary cache.
///
/// The cache is keyed by "text-marisa\n" followed by the resource's cache key
/// and holds weak references, so an entry is only reused while some caller
/// still owns the dictionary.
///
/// @param fileName Resource name handed to the provider.
/// @return The cached dictionary when a live one exists, otherwise a newly
///         built one.
/// @throws FileNotFound when no resource provider is configured.
DictPtr LoadTextMarisaDictWithResourceProvider(const std::string& fileName) {
  if (resourceProvider == nullptr) {
    throw FileNotFound(fileName);
  }

  const std::shared_ptr<const ResourceProvider::Resource> source =
      resourceProvider->GetResource(fileName);
  const std::string key = "text-marisa\n" + source->CacheKey();

  // First look-up: reuse a live cached dictionary if there is one.
  {
    std::lock_guard<std::mutex> guard(DictCacheMutex());
    PruneExpiredDictCache();
    const auto hit = DictCache().find(key);
    if (hit != DictCache().end()) {
      if (DictPtr live = hit->second.lock()) {
        return live;
      }
    }
  }

  // Parse and convert without holding the cache mutex.
  TextDictPtr parsed =
      TextDict::NewFromBuffer(source->Data(), source->Size());
  DictPtr built = MarisaDict::NewFromDict(*parsed);

  // Second look-up: another caller may have published the same dictionary
  // while this one was being built; if so, hand out theirs and drop ours.
  std::lock_guard<std::mutex> guard(DictCacheMutex());
  PruneExpiredDictCache();
  std::weak_ptr<Dict>& slot = DictCache()[key];
  if (DictPtr winner = slot.lock()) {
    return winner;
  }
  slot = built;
  return built;
}

DictPtr LoadDictFromFile(const std::string& type,
const std::string& fileName) {
if (type == "text") {
DictPtr dict = LoadDictWithResourceProvider<TextDict>("text", fileName);
return MarisaDict::NewFromDict(*dict.get());
return LoadTextMarisaDictWithResourceProvider(fileName);
}
#ifdef ENABLE_DARTS
if (type == "ocd") {
Expand Down Expand Up @@ -610,8 +645,14 @@ Config::NewFromFile(const std::string& fileName,
std::string prefixedFileName;
if (provider != nullptr) {
try {
prefixedFileName = provider->Resolve(fileName);
const std::shared_ptr<const ResourceProvider::Resource> resource =
provider->GetResource(fileName);
impl->configDirectory = GetParentDirectory(resource->Name());
return NewFromString(std::string(resource->Data(), resource->Size()),
provider, options);
} catch (const FileNotFound&) {
// Some callers pass a provider for dictionaries only; keep normal config
// file lookup as a fallback when the provider cannot supply the config.
prefixedFileName = impl->FindConfigFile(fileName);
}
} else {
Expand All @@ -630,7 +671,12 @@ Config::NewFromFile(const std::string& fileName,
if (slashPos != std::string::npos) {
impl->configDirectory = prefixedFileName.substr(0, slashPos) + "/";
}
return NewFromString(content, provider, options);
std::shared_ptr<ResourceProvider> effectiveProvider = provider;
if (effectiveProvider == nullptr) {
effectiveProvider =
NewFilesystemResourceProvider(impl->configDirectory, impl->paths);
}
return NewFromString(content, effectiveProvider, options);
}

ConverterPtr Config::NewFromFile(const std::string& fileName,
Expand Down
Loading
Loading