Skip to content

Commit ea38e80

Browse files
committed
Adding byte order mark detection and updating the readme
1 parent 3dc025b commit ea38e80

File tree

8 files changed

+260
-138
lines changed

8 files changed

+260
-138
lines changed

README.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Determine the encoding and language of text files!
1212

1313
- Detects 40 languages as well as the appropriate encoding
1414
- Available as CLI, in Node.js and in the browser
15-
- Supports .txt, .srt, and .sub
15+
- Supports .txt, .srt, .sub, .html, .csv, .tsv
1616
- Works best with large inputs
1717
- Completely free, no API key required
1818

@@ -261,7 +261,6 @@ The confidence score ranges from 0 to 1. It is based on the amount of matches th
261261
## Known Issues
262262

263263
- Unable to detect Shift-JIS encoded Japanese text files when using Node.js. Solutions are welcome!
264-
- Unable to detect UTF-16-LE encoded files when using Node.js. Solutions are welcome!
265264

266265
## License
267266

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
const byteOrderMarks = require("../config/byteOrderMarkObject.js");
2+
3+
module.exports = (uInt8Start) => {
4+
for (const element of byteOrderMarks) {
5+
if (element.regex.test(uInt8Start)) return element.encoding;
6+
}
7+
8+
return null;
9+
};

src/components/processContent.js

Lines changed: 41 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -1,34 +1,41 @@
1-
const countAllMatches = require('./processing-content/countAllMatches.js');
2-
const calculateConfidenceScore = require('./processing-content/calculateConfidenceScore.js');
3-
4-
module.exports = (data) => {
5-
const fileInfo = {};
6-
const languageArr = require('../language-config/languageObject.js');
7-
8-
data.languageArr = countAllMatches(data, languageArr);
9-
10-
fileInfo.language = data.languageArr.reduce((acc, val) => acc.count > val.count ? acc : val).name;
11-
12-
// "pos" gives us the position in the language array that has the most matches
13-
data.pos = data.languageArr.findIndex(elem => elem.name === fileInfo.language);
14-
15-
// Determine the encoding
16-
fileInfo.encoding = data.utf8 ? "UTF-8" : data.languageArr[data.pos].encoding;
17-
18-
const calculations = calculateConfidenceScore(data, fileInfo);
19-
20-
if (data.testFilePath) {
21-
return calculations;
22-
}
23-
24-
fileInfo.confidence = calculations;
25-
26-
// Edge case, when no matches were found
27-
if (!data.languageArr[data.pos].count) {
28-
fileInfo.language = null;
29-
fileInfo.encoding = data.utf8 ? "UTF-8" : null;
30-
fileInfo.confidence = data.utf8 ? 1 : null;
31-
}
32-
33-
return fileInfo;
34-
};
1+
const countAllMatches = require("./processing-content/countAllMatches.js");
2+
const calculateConfidenceScore = require("./processing-content/calculateConfidenceScore.js");
3+
4+
module.exports = (data, fileInfo) => {
5+
data.languageArr = countAllMatches(data, fileInfo.encoding);
6+
7+
fileInfo.language = data.languageArr.reduce((acc, val) =>
8+
acc.count > val.count ? acc : val
9+
).name;
10+
11+
// "pos" gives us the position in the language array that has the most matches
12+
data.pos = data.languageArr.findIndex(
13+
(elem) => elem.name === fileInfo.language
14+
);
15+
16+
// Determine the encoding
17+
if (!fileInfo.encoding) {
18+
fileInfo.encoding = data.languageArr[data.pos].encoding;
19+
}
20+
21+
const calculations = calculateConfidenceScore(data, fileInfo);
22+
23+
if (data.testFile) {
24+
return calculations;
25+
}
26+
27+
if (fileInfo.confidence.encoding) {
28+
fileInfo.confidence.language = calculations;
29+
} else {
30+
fileInfo.confidence.encoding = calculations;
31+
fileInfo.confidence.language = calculations;
32+
}
33+
34+
// Edge case, when no matches were found
35+
if (!data.languageArr[data.pos].count) {
36+
fileInfo.language = null;
37+
fileInfo.confidence.language = null;
38+
}
39+
40+
return fileInfo;
41+
};
Lines changed: 70 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,70 +1,80 @@
11
module.exports = (data, fileInfo) => {
2-
const charRegex = new RegExp(/\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/, "g");
3-
const totalCharacters = data.content.replace(charRegex, "").length;
4-
const langArr = data.languageArr;
5-
const pos = data.pos;
6-
const testFilePath = data.testFilePath;
2+
const charRegex = new RegExp(
3+
/\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/,
4+
"g"
5+
);
6+
const totalCharacters = data.content.replace(charRegex, "").length;
7+
const langArr = data.languageArr;
8+
const pos = data.pos;
9+
const testFile = data.testFile;
710

8-
const secondLanguage = langArr.reduce((acc, val) => {
9-
if (acc.name === fileInfo.language) return val;
10-
if (val.name === fileInfo.language) return acc;
11+
const secondLanguage = langArr.reduce((acc, val) => {
12+
if (acc.name === fileInfo.language) return val;
13+
if (val.name === fileInfo.language) return acc;
1114

12-
return acc.count >= val.count ? acc : val;
13-
});
15+
return acc.count >= val.count ? acc : val;
16+
});
1417

15-
const languageRatio = langArr[pos].count / (secondLanguage.count + langArr[pos].count);
16-
const characterWordRatio = langArr[pos].count / totalCharacters;
18+
const languageRatio =
19+
langArr[pos].count / (secondLanguage.count + langArr[pos].count);
20+
const characterWordRatio = langArr[pos].count / totalCharacters;
1721

18-
let lowerLimit = null;
19-
let upperLimit = null;
20-
const multiplier = 0.8;
22+
let lowerLimit = null;
23+
let upperLimit = null;
24+
const multiplier = 0.8;
2125

22-
if (data.utf8) {
23-
lowerLimit = langArr[pos].utfFrequency ? langArr[pos].utfFrequency.low * multiplier : null;
24-
upperLimit = langArr[pos].utfFrequency ? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2 : null;
26+
if (fileInfo.encoding === "UTF-8" || fileInfo.encoding === "UTF-16LE") {
27+
lowerLimit = langArr[pos].utfFrequency
28+
? langArr[pos].utfFrequency.low * multiplier
29+
: null;
30+
upperLimit = langArr[pos].utfFrequency
31+
? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2
32+
: null;
33+
} else {
34+
lowerLimit = langArr[pos].isoFrequency
35+
? langArr[pos].isoFrequency.low * multiplier
36+
: null;
37+
upperLimit = langArr[pos].isoFrequency
38+
? (langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2
39+
: null;
40+
}
2541

26-
} else {
27-
lowerLimit = langArr[pos].isoFrequency ? langArr[pos].isoFrequency.low * multiplier : null;
28-
upperLimit = langArr[pos].isoFrequency ? (langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2 : null;
29-
}
42+
let confidenceScore;
3043

31-
let confidenceScore;
44+
if (!lowerLimit || !upperLimit) {
45+
confidenceScore = null;
46+
} else if (characterWordRatio >= upperLimit) {
47+
confidenceScore = 1;
48+
} else if (characterWordRatio > lowerLimit) {
49+
const range = upperLimit - lowerLimit;
50+
const surplus = characterWordRatio - lowerLimit;
51+
const confidenceRaisePercentage = surplus / range;
52+
const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage;
53+
confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2));
54+
} else {
55+
confidenceScore = Number(
56+
(languageRatio * (characterWordRatio / lowerLimit)).toFixed(2)
57+
);
58+
}
3259

33-
if (!lowerLimit || !upperLimit) {
34-
confidenceScore = null;
60+
// If the test script is running
61+
if (testFile) {
62+
return {
63+
name: testFile.substr(testFile.lastIndexOf("/") + 1),
64+
path: testFile,
65+
encoding: fileInfo.encoding,
66+
language: fileInfo.language,
67+
languageConfidence: confidenceScore,
68+
ratio: Number(languageRatio.toFixed(2)),
69+
count: langArr[pos].count,
70+
totalCharacters: totalCharacters,
71+
characterWordRatio: characterWordRatio.toFixed(6),
72+
secondLanguage: {
73+
name: secondLanguage.name,
74+
count: secondLanguage.count,
75+
},
76+
};
77+
}
3578

36-
} else if (characterWordRatio >= upperLimit) {
37-
confidenceScore = 1;
38-
39-
} else if (characterWordRatio > lowerLimit) {
40-
const range = upperLimit - lowerLimit;
41-
const surplus = characterWordRatio - lowerLimit;
42-
const confidenceRaisePercentage = surplus / range;
43-
const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage;
44-
confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2));
45-
46-
} else {
47-
confidenceScore = Number((languageRatio * (characterWordRatio / lowerLimit)).toFixed(2));
48-
}
49-
50-
// If the test script is running
51-
if (testFilePath) {
52-
return {
53-
name: testFilePath.substr(testFilePath.lastIndexOf('/') + 1),
54-
path: testFilePath,
55-
language: fileInfo.language,
56-
utf8: data.utf8,
57-
confidence: confidenceScore,
58-
ratio: Number(languageRatio.toFixed(2)),
59-
count: langArr[pos].count,
60-
totalCharacters: totalCharacters,
61-
characterWordRatio: characterWordRatio.toFixed(6),
62-
secondLanguage: {
63-
name: secondLanguage.name,
64-
count: secondLanguage.count
65-
}
66-
};
67-
}
68-
69-
return confidenceScore;
70-
};
79+
return confidenceScore;
80+
};
Lines changed: 25 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -1,29 +1,31 @@
1-
module.exports = (data, languageArr) => {
2-
const newLanguageArr = [];
1+
const languageArr = require("../../config/languageObject.js");
32

4-
// Cloning the language array and making sure that "count" has no reference to "languageArr"!
5-
languageArr.forEach((obj) => {
6-
const updatedLangObj = {};
7-
Object.keys(obj).forEach(key => {
8-
if (key !== "count") {
9-
updatedLangObj[key] = obj[key];
10-
} else {
11-
updatedLangObj.count = 0;
12-
}
13-
});
14-
newLanguageArr.push(updatedLangObj);
3+
module.exports = (data, encoding) => {
4+
const newLanguageArr = [];
5+
6+
// Cloning the language array and making sure that "count" has no reference to "languageArr"!
7+
languageArr.forEach((obj) => {
8+
const updatedLangObj = {};
9+
Object.keys(obj).forEach((key) => {
10+
if (key !== "count") {
11+
updatedLangObj[key] = obj[key];
12+
} else {
13+
updatedLangObj.count = 0;
14+
}
1515
});
16+
newLanguageArr.push(updatedLangObj);
17+
});
1618

17-
const regex = data.utf8 ? "utfRegex" : "isoRegex";
19+
const regex = encoding ? "utfRegex" : "isoRegex";
1820

19-
// Populate the count property of our language array!
20-
newLanguageArr.forEach(lang => {
21-
if (lang[regex]) {
22-
const matches = data.content.match(lang[regex]);
21+
// Populating the count property of the language array
22+
newLanguageArr.forEach((lang) => {
23+
if (lang[regex]) {
24+
const matches = data.content.match(lang[regex]);
2325

24-
if (matches) lang.count = matches.length;
25-
}
26-
});
26+
if (matches) lang.count = matches.length;
27+
}
28+
});
2729

28-
return newLanguageArr;
29-
}
30+
return newLanguageArr;
31+
};

src/config/byteOrderMarkObject.js

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
module.exports = [
2+
{
3+
encoding: "UTF-EBCDIC",
4+
regex: new RegExp("221 115 102 115"),
5+
},
6+
{
7+
encoding: "GB-18030",
8+
regex: new RegExp("132 49 149 51"),
9+
},
10+
{
11+
encoding: "UTF-32LE",
12+
regex: new RegExp("255 254 0 0"),
13+
},
14+
{
15+
encoding: "UTF-32BE",
16+
regex: new RegExp("0 0 254 255"),
17+
},
18+
{
19+
encoding: "UTF-8",
20+
regex: new RegExp("239 187 191"),
21+
},
22+
{
23+
encoding: "UTF-7",
24+
regex: new RegExp("43 47 118"),
25+
},
26+
{
27+
encoding: "UTF-1",
28+
regex: new RegExp("247 100 76"),
29+
},
30+
{
31+
encoding: "SCSU",
32+
regex: new RegExp("14 254 255"),
33+
},
34+
{
35+
encoding: "BOCU-1",
36+
regex: new RegExp("251 238 40"),
37+
},
38+
{
39+
encoding: "UTF-16BE",
40+
regex: new RegExp("254 255"),
41+
},
42+
{
43+
encoding: "UTF-16LE",
44+
regex: new RegExp("255 254"),
45+
},
46+
];

0 commit comments

Comments
 (0)