Adding byte order mark detection and updating the readme

gignupg · gignupg · commit ea38e804c31b · 2021-04-19T16:12:51.000+02:00
diff --git a/README.md b/README.md
@@ -12,7 +12,7 @@ Determine the encoding and language of text files!
 
 - Detects 40 languages as well as the appropriate encoding
 - Available as CLI, in Node.js and in the browser
-- Supports .txt, .srt, and .sub
+- Supports .txt, .srt, .sub, .html, .csv, .tsv
 - Works best with large inputs
 - Completely free, no API key required
 
@@ -261,7 +261,6 @@ The confidence score ranges from 0 to 1. It is based on the amount of matches th
 ## Known Issues
 
 - Unable to detect Shift-JIS encoded Japanese text files when using Node.js. Solutions are welcome!
-- Unable to detect UTF-16-LE encoded files when using Node.js. Solutions are welcome!
 
 ## License
 
diff --git a/src/components/checkByteOrderMark.js b/src/components/checkByteOrderMark.js
@@ -0,0 +1,9 @@
+const byteOrderMarks = require("../config/byteOrderMarkObject.js");
+
+module.exports = (uInt8Start) => {
+  for (const element of byteOrderMarks) {
+    if (element.regex.test(uInt8Start)) return element.encoding;
+  }
+
+  return null;
+};
diff --git a/src/components/processContent.js b/src/components/processContent.js
@@ -1,34 +1,41 @@
-const countAllMatches = require('./processing-content/countAllMatches.js');
-const calculateConfidenceScore = require('./processing-content/calculateConfidenceScore.js');
-
-module.exports = (data) => {
-    const fileInfo = {};
-    const languageArr = require('../language-config/languageObject.js');
-
-    data.languageArr = countAllMatches(data, languageArr);
-
-    fileInfo.language = data.languageArr.reduce((acc, val) => acc.count > val.count ? acc : val).name;
-
-    // "pos" gives us the position in the language array that has the most matches
-    data.pos = data.languageArr.findIndex(elem => elem.name === fileInfo.language);
-
-    // Determine the encoding
-    fileInfo.encoding = data.utf8 ? "UTF-8" : data.languageArr[data.pos].encoding;
-
-    const calculations = calculateConfidenceScore(data, fileInfo);
-
-    if (data.testFilePath) {
-        return calculations;
-    }
-
-    fileInfo.confidence = calculations;
-
-    // Edge case, when no matches were found
-    if (!data.languageArr[data.pos].count) {
-        fileInfo.language = null;
-        fileInfo.encoding = data.utf8 ? "UTF-8" : null;
-        fileInfo.confidence = data.utf8 ? 1 : null;
-    }
-
-    return fileInfo;
-};
+const countAllMatches = require("./processing-content/countAllMatches.js");
+const calculateConfidenceScore = require("./processing-content/calculateConfidenceScore.js");
+
+module.exports = (data, fileInfo) => {
+  data.languageArr = countAllMatches(data, fileInfo.encoding);
+
+  fileInfo.language = data.languageArr.reduce((acc, val) =>
+    acc.count > val.count ? acc : val
+  ).name;
+
+  // "pos" gives us the position in the language array that has the most matches
+  data.pos = data.languageArr.findIndex(
+    (elem) => elem.name === fileInfo.language
+  );
+
+  // Determine the encoding
+  if (!fileInfo.encoding) {
+    fileInfo.encoding = data.languageArr[data.pos].encoding;
+  }
+
+  const calculations = calculateConfidenceScore(data, fileInfo);
+
+  if (data.testFile) {
+    return calculations;
+  }
+
+  if (fileInfo.confidence.encoding) {
+    fileInfo.confidence.language = calculations;
+  } else {
+    fileInfo.confidence.encoding = calculations;
+    fileInfo.confidence.language = calculations;
+  }
+
+  // Edge case, when no matches were found
+  if (!data.languageArr[data.pos].count) {
+    fileInfo.language = null;
+    fileInfo.confidence.language = null;
+  }
+
+  return fileInfo;
+};
diff --git a/src/components/processing-content/calculateConfidenceScore.js b/src/components/processing-content/calculateConfidenceScore.js
@@ -1,70 +1,80 @@
 module.exports = (data, fileInfo) => {
-    const charRegex = new RegExp(/\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/, "g");
-    const totalCharacters = data.content.replace(charRegex, "").length;
-    const langArr = data.languageArr;
-    const pos = data.pos;
-    const testFilePath = data.testFilePath;
+  // Scores the detected language (langArr[pos]) from two ratios: its matches
+  // versus the runner-up language, and its matches per counted character.
+  // Returns the score, or null when the language has no frequency limits.
+  // When data.testFile is set, a detailed report object is returned instead.
+  //
+  // Digits, whitespace and common punctuation are removed from the content
+  // before its characters are counted.
+  // NOTE(review): if nothing remains after stripping, or both counts are 0,
+  // the divisions below yield NaN/Infinity - confirm callers handle that.
+  const charRegex = new RegExp(
+    /\d|\n|\s|\-|\.|\,|\:|\;|\?|\!|\<|\>|\[|\]|\{|\}|\&|\=|\|/,
+    "g"
+  );
+  const totalCharacters = data.content.replace(charRegex, "").length;
+  const langArr = data.languageArr;
+  const pos = data.pos;
+  const testFile = data.testFile;
 
-    const secondLanguage = langArr.reduce((acc, val) => {
-        if (acc.name === fileInfo.language) return val;
-        if (val.name === fileInfo.language) return acc;
+  // Runner-up: the entry with the highest count, skipping the detected language.
+  const secondLanguage = langArr.reduce((acc, val) => {
+    if (acc.name === fileInfo.language) return val;
+    if (val.name === fileInfo.language) return acc;
 
-        return acc.count >= val.count ? acc : val;
-    });
+    return acc.count >= val.count ? acc : val;
+  });
 
-    const languageRatio = langArr[pos].count / (secondLanguage.count + langArr[pos].count);
-    const characterWordRatio = langArr[pos].count / totalCharacters;
+  // Share of the detected language among the two best candidates, and its
+  // matches relative to the number of counted characters.
+  const languageRatio =
+    langArr[pos].count / (secondLanguage.count + langArr[pos].count);
+  const characterWordRatio = langArr[pos].count / totalCharacters;
 
-    let lowerLimit = null;
-    let upperLimit = null;
-    const multiplier = 0.8;
+  let lowerLimit = null;
+  let upperLimit = null;
+  const multiplier = 0.8;
 
-    if (data.utf8) {
-        lowerLimit = langArr[pos].utfFrequency ? langArr[pos].utfFrequency.low * multiplier : null;
-        upperLimit = langArr[pos].utfFrequency ? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2 : null;
+  // UTF-8 and UTF-16LE use the language's utfFrequency limits; every other
+  // encoding uses isoFrequency. The limits stay null if the language has none.
+  if (fileInfo.encoding === "UTF-8" || fileInfo.encoding === "UTF-16LE") {
+    lowerLimit = langArr[pos].utfFrequency
+      ? langArr[pos].utfFrequency.low * multiplier
+      : null;
+    upperLimit = langArr[pos].utfFrequency
+      ? (langArr[pos].utfFrequency.low + langArr[pos].utfFrequency.high) / 2
+      : null;
+  } else {
+    lowerLimit = langArr[pos].isoFrequency
+      ? langArr[pos].isoFrequency.low * multiplier
+      : null;
+    upperLimit = langArr[pos].isoFrequency
+      ? (langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2
+      : null;
+  }
 
-    } else {
-        lowerLimit = langArr[pos].isoFrequency ? langArr[pos].isoFrequency.low * multiplier : null;
-        upperLimit = langArr[pos].isoFrequency ? (langArr[pos].isoFrequency.low + langArr[pos].isoFrequency.high) / 2 : null;
-    }
+  let confidenceScore;
 
-    let confidenceScore;
+  // At or above the upper limit the score is 1. Between the limits,
+  // languageRatio is raised proportionally towards 1. At or below the lower
+  // limit, languageRatio is scaled down by characterWordRatio / lowerLimit.
+  if (!lowerLimit || !upperLimit) {
+    confidenceScore = null;
+  } else if (characterWordRatio >= upperLimit) {
+    confidenceScore = 1;
+  } else if (characterWordRatio > lowerLimit) {
+    const range = upperLimit - lowerLimit;
+    const surplus = characterWordRatio - lowerLimit;
+    const confidenceRaisePercentage = surplus / range;
+    const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage;
+    confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2));
+  } else {
+    confidenceScore = Number(
+      (languageRatio * (characterWordRatio / lowerLimit)).toFixed(2)
+    );
+  }
 
-    if (!lowerLimit || !upperLimit) {
-        confidenceScore = null;
+  // If the test script is running
+  if (testFile) {
+    return {
+      name: testFile.substr(testFile.lastIndexOf("/") + 1),
+      path: testFile,
+      encoding: fileInfo.encoding,
+      language: fileInfo.language,
+      languageConfidence: confidenceScore,
+      ratio: Number(languageRatio.toFixed(2)),
+      count: langArr[pos].count,
+      totalCharacters: totalCharacters,
+      characterWordRatio: characterWordRatio.toFixed(6),
+      secondLanguage: {
+        name: secondLanguage.name,
+        count: secondLanguage.count,
+      },
+    };
+  }
 
-    } else if (characterWordRatio >= upperLimit) {
-        confidenceScore = 1;
-
-    } else if (characterWordRatio > lowerLimit) {
-        const range = upperLimit - lowerLimit;
-        const surplus = characterWordRatio - lowerLimit;
-        const confidenceRaisePercentage = surplus / range;
-        const confidenceRaise = (1 - languageRatio) * confidenceRaisePercentage;
-        confidenceScore = Number((languageRatio + confidenceRaise).toFixed(2));
-
-    } else {
-        confidenceScore = Number((languageRatio * (characterWordRatio / lowerLimit)).toFixed(2));
-    }
-
-    // If the test script is running
-    if (testFilePath) {
-        return {
-            name: testFilePath.substr(testFilePath.lastIndexOf('/') + 1),
-            path: testFilePath,
-            language: fileInfo.language,
-            utf8: data.utf8,
-            confidence: confidenceScore,
-            ratio: Number(languageRatio.toFixed(2)),
-            count: langArr[pos].count,
-            totalCharacters: totalCharacters,
-            characterWordRatio: characterWordRatio.toFixed(6),
-            secondLanguage: {
-                name: secondLanguage.name,
-                count: secondLanguage.count
-            }
-        };
-    }
-
-    return confidenceScore;
-};
+  return confidenceScore;
+};
diff --git a/src/components/processing-content/countAllMatches.js b/src/components/processing-content/countAllMatches.js
@@ -1,29 +1,31 @@
-module.exports = (data, languageArr) => {
-    const newLanguageArr = [];
+const languageArr = require("../../config/languageObject.js");
 
-    // Cloning the language array and making sure that "count" has no reference to "languageArr"!
-    languageArr.forEach((obj) => {
-        const updatedLangObj = {};
-        Object.keys(obj).forEach(key => {
-            if (key !== "count") {
-                updatedLangObj[key] = obj[key];
-            } else {
-                updatedLangObj.count = 0;
-            }
-        });
-        newLanguageArr.push(updatedLangObj);
+module.exports = (data, encoding) => {
+  // Returns a copy of the language list in which each entry's "count" holds
+  // the length of the match result of its regex against data.content.
+  // The shared languageArr itself is not modified.
+  const newLanguageArr = [];
+
+  // Cloning the language array and making sure that "count" has no reference to "languageArr"!
+  languageArr.forEach((obj) => {
+    const updatedLangObj = {};
+    Object.keys(obj).forEach((key) => {
+      if (key !== "count") {
+        // Shallow copy: the property value is shared with the original entry.
+        updatedLangObj[key] = obj[key];
+      } else {
+        updatedLangObj.count = 0;
+      }
     });
+    newLanguageArr.push(updatedLangObj);
+  });
 
-    const regex = data.utf8 ? "utfRegex" : "isoRegex";
+  // Name of the regex property to use on each language entry.
+  // NOTE(review): any truthy encoding, not only UTF ones, selects "utfRegex" - confirm this is intended.
+  const regex = encoding ? "utfRegex" : "isoRegex";
 
-    // Populate the count property of our language array!
-    newLanguageArr.forEach(lang => {
-        if (lang[regex]) {
-            const matches = data.content.match(lang[regex]);
+  // Populating the count property of the language array
+  newLanguageArr.forEach((lang) => {
+    // Entries without the selected regex keep their count of 0.
+    if (lang[regex]) {
+      const matches = data.content.match(lang[regex]);
 
-            if (matches) lang.count = matches.length;
-        }
-    });
+      if (matches) lang.count = matches.length;
+    }
+  });
 
-    return newLanguageArr;
-}
+  return newLanguageArr;
+};
diff --git a/src/config/byteOrderMarkObject.js b/src/config/byteOrderMarkObject.js
@@ -0,0 +1,46 @@
+module.exports = [
+  {
+    encoding: "UTF-EBCDIC",
+    regex: new RegExp("221 115 102 115"),
+  },
+  {
+    encoding: "GB-18030",
+    regex: new RegExp("132 49 149 51"),
+  },
+  {
+    encoding: "UTF-32LE",
+    regex: new RegExp("255 254 0 0"),
+  },
+  {
+    encoding: "UTF-32BE",
+    regex: new RegExp("0 0 254 255"),
+  },
+  {
+    encoding: "UTF-8",
+    regex: new RegExp("239 187 191"),
+  },
+  {
+    encoding: "UTF-7",
+    regex: new RegExp("43 47 118"),
+  },
+  {
+    encoding: "UTF-1",
+    regex: new RegExp("247 100 76"),
+  },
+  {
+    encoding: "SCSU",
+    regex: new RegExp("14 254 255"),
+  },
+  {
+    encoding: "BOCU-1",
+    regex: new RegExp("251 238 40"),
+  },
+  {
+    encoding: "UTF-16BE",
+    regex: new RegExp("254 255"),
+  },
+  {
+    encoding: "UTF-16LE",
+    regex: new RegExp("255 254"),
+  },
+];
diff --git a/src/config/languageObject.js b/src/config/languageObject.js
diff --git a/src/index-node.js b/src/index-node.js