jschardet

2025-12-24 11:45:40 +08:00 · 2019-06-11 17:05:47 +08:00
parent c31ad00747
commit cce2bb6d1f
57 changed files with 18273 additions and 0 deletions
--- a/node_modules/jschardet/src/chardistribution.js
+++ b/node_modules/jschardet/src/chardistribution.js
@@ -0,0 +1,301 @@
+/*
+ * The Original Code is Mozilla Universal charset detector code.
+ *
+ * The Initial Developer of the Original Code is
+ * Netscape Communications Corporation.
+ * Portions created by the Initial Developer are Copyright (C) 2001
+ * the Initial Developer. All Rights Reserved.
+ *
+ * Contributor(s):
+ *   António Afonso (antonio.afonso gmail.com) - port to JavaScript
+ *   Mark Pilgrim - port to Python
+ *   Shy Shalom - original C code
+ *
+ * This library is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU Lesser General Public
+ * License as published by the Free Software Foundation; either
+ * version 2.1 of the License, or (at your option) any later version.
+ *
+ * This library is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * Lesser General Public License for more details.
+ *
+ * You should have received a copy of the GNU Lesser General Public
+ * License along with this library; if not, write to the Free Software
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+ * 02110-1301  USA
+ */
+
+var jisfreq = require('./jisfreq');
+var euctwfreq = require('./euctwfreq');
+var euckrfreq = require('./euckrfreq');
+var gb2312freq = require('./gb2312freq');
+var big5freq = require('./big5freq');
+
+/**
+ * Base analyser that counts how many of the two-byte characters fed to it
+ * are "frequent" according to a per-language frequency table, and turns
+ * that count into a confidence value.
+ *
+ * The base constructor leaves the table fields set to null and its
+ * getOrder() always answers -1; code that wants real results replaces
+ * getOrder and fills in _mCharToFreqOrder, _mTableSize and
+ * _mTypicalDistributionRatio after calling this constructor.
+ */
+function CharDistributionAnalysis() {
+    var ENOUGH_DATA_THRESHOLD = 1024; // gotEnoughData() is true once more characters than this were counted
+    var SURE_YES = 0.99;              // upper bound of any reported confidence
+    var SURE_NO = 0.01;               // confidence reported when there is too little evidence
+    var MINIMUM_DATA_THRESHOLD = 3;   // the frequent-character count must exceed this before a ratio is computed
+
+    /**
+     * Put the counters back to their initial state.
+     */
+    this.reset = function() {
+        this._mDone = false;   // "conclusion reached" flag; only initialised here
+        this._mTotalChars = 0; // characters that produced a non-negative order
+        this._mFreqChars = 0;  // of those, characters whose frequency order is below 512
+    };
+
+    /**
+     * Account for one character.
+     *
+     * @param aStr     the character; handed unchanged to getOrder()
+     * @param aCharLen its length; anything other than 2 is ignored
+     */
+    this.feed = function(aStr, aCharLen) {
+        // only two-byte characters take part in the distribution analysis
+        var order = ( aCharLen == 2 ) ? this.getOrder(aStr) : -1;
+        if( !(order >= 0) ) {
+            // negative (or non-numeric) order: nothing to count
+            return;
+        }
+        this._mTotalChars++;
+        if( order < this._mTableSize && 512 > this._mCharToFreqOrder[order] ) {
+            this._mFreqChars++;
+        }
+    };
+
+    /**
+     * Confidence derived from the counters gathered so far.
+     *
+     * @return SURE_NO when nothing was counted or the frequent count does
+     *         not exceed MINIMUM_DATA_THRESHOLD; otherwise the ratio
+     *         frequent / (rare * typical ratio), capped at SURE_YES
+     */
+    this.getConfidence = function() {
+        var total = this._mTotalChars;
+        var frequent = this._mFreqChars;
+
+        if( total <= 0 || frequent <= MINIMUM_DATA_THRESHOLD ) {
+            return SURE_NO;
+        }
+        if( total != frequent ) {
+            var ratio = frequent / ((total - frequent) * this._mTypicalDistributionRatio);
+            if( ratio < SURE_YES ) {
+                return ratio;
+            }
+        }
+
+        // never claim to be 100% sure
+        return SURE_YES;
+    };
+
+    /**
+     * @return true once more than ENOUGH_DATA_THRESHOLD characters were
+     *         counted; a verdict does not need the whole input
+     */
+    this.gotEnoughData = function() {
+        return this._mTotalChars > ENOUGH_DATA_THRESHOLD;
+    };
+
+    /**
+     * Translate a character into its "order", the index used with the
+     * frequency table. Working on orders instead of raw encoded strings
+     * lets several encodings of one language share a table.
+     * The base implementation knows no encoding and always returns -1.
+     */
+    this.getOrder = function(aStr) {
+        return -1;
+    };
+
+    // Table fields start out empty, then the counters are initialised.
+    this._mCharToFreqOrder = null;          // maps an order to its frequency order
+    this._mTableSize = null;                // number of entries in that table
+    this._mTypicalDistributionRatio = null; // per-language constant used by getConfidence()
+    this.reset();
+}
+
+exports.CharDistributionAnalysis = CharDistributionAnalysis
+
+/**
+ * Distribution analyser for EUC-TW: a CharDistributionAnalysis wired to
+ * the euctwfreq table with an EUC-TW specific getOrder().
+ */
+function EUCTWDistributionAnalysis() {
+    CharDistributionAnalysis.apply(this);
+
+    // frequency data for this encoding
+    this._mCharToFreqOrder = euctwfreq.EUCTWCharToFreqOrder;
+    this._mTableSize = euctwfreq.EUCTW_TABLE_SIZE;
+    this._mTypicalDistributionRatio = euctwfreq.EUCTW_TYPICAL_DISTRIBUTION_RATIO;
+
+    /**
+     * Order of a two-byte EUC-TW character.
+     * Range of interest: first byte 0xc4 -- 0xfe, second byte 0xa1 -- 0xfe.
+     * Only the lower bound of the first byte is checked here; per the
+     * original note, the state machine has already validated the bytes.
+     *
+     * @return 94 * (first - 0xC4) + (second - 0xA1), or -1 when the first
+     *         byte is below 0xC4
+     */
+    this.getOrder = function(aStr) {
+        var lead = aStr.charCodeAt(0);
+        return ( lead >= 0xC4 )
+            ? 94 * (lead - 0xC4) + aStr.charCodeAt(1) - 0xA1
+            : -1;
+    };
+}
+EUCTWDistributionAnalysis.prototype = new CharDistributionAnalysis();
+
+exports.EUCTWDistributionAnalysis = EUCTWDistributionAnalysis
+
+/**
+ * Distribution analyser for EUC-KR: a CharDistributionAnalysis wired to
+ * the euckrfreq table with an EUC-KR specific getOrder().
+ */
+function EUCKRDistributionAnalysis() {
+    CharDistributionAnalysis.apply(this);
+
+    // frequency data for this encoding
+    this._mCharToFreqOrder = euckrfreq.EUCKRCharToFreqOrder;
+    this._mTableSize = euckrfreq.EUCKR_TABLE_SIZE;
+    this._mTypicalDistributionRatio = euckrfreq.EUCKR_TYPICAL_DISTRIBUTION_RATIO;
+
+    /**
+     * Order of a two-byte EUC-KR character.
+     * Range of interest: first byte 0xb0 -- 0xfe, second byte 0xa1 -- 0xfe.
+     * Only the lower bound of the first byte is checked here; per the
+     * original note, the state machine has already validated the bytes.
+     *
+     * @return 94 * (first - 0xB0) + (second - 0xA1), or -1 when the first
+     *         byte is below 0xB0
+     */
+    this.getOrder = function(aStr) {
+        var lead = aStr.charCodeAt(0);
+        return ( lead >= 0xB0 )
+            ? 94 * (lead - 0xB0) + aStr.charCodeAt(1) - 0xA1
+            : -1;
+    };
+}
+EUCKRDistributionAnalysis.prototype = new CharDistributionAnalysis();
+
+exports.EUCKRDistributionAnalysis = EUCKRDistributionAnalysis
+
+/**
+ * Distribution analyser for GB2312: a CharDistributionAnalysis wired to
+ * the gb2312freq table with a GB2312 specific getOrder().
+ */
+function GB2312DistributionAnalysis() {
+    CharDistributionAnalysis.apply(this);
+
+    // frequency data for this encoding
+    this._mCharToFreqOrder = gb2312freq.GB2312CharToFreqOrder;
+    this._mTableSize = gb2312freq.GB2312_TABLE_SIZE;
+    this._mTypicalDistributionRatio = gb2312freq.GB2312_TYPICAL_DISTRIBUTION_RATIO;
+
+    /**
+     * Order of a two-byte GB2312 character.
+     * Range of interest: first byte 0xb0 -- 0xfe, second byte 0xa1 -- 0xfe.
+     * Both lower bounds are checked here; per the original note, the
+     * state machine has already validated the rest.
+     *
+     * @return 94 * (first - 0xB0) + (second - 0xA1), or -1 when either
+     *         byte is below its lower bound
+     */
+    this.getOrder = function(aStr) {
+        var lead = aStr.charCodeAt(0);
+        var trail = aStr.charCodeAt(1);
+        var inRange = lead >= 0xB0 && trail >= 0xA1;
+        return inRange ? 94 * (lead - 0xB0) + trail - 0xA1 : -1;
+    };
+}
+GB2312DistributionAnalysis.prototype = new CharDistributionAnalysis();
+
+exports.GB2312DistributionAnalysis = GB2312DistributionAnalysis
+
+/**
+ * Distribution analyser for Big5: a CharDistributionAnalysis wired to
+ * the big5freq table with a Big5 specific getOrder().
+ */
+function Big5DistributionAnalysis() {
+    CharDistributionAnalysis.apply(this);
+
+    // frequency data for this encoding
+    this._mCharToFreqOrder = big5freq.Big5CharToFreqOrder;
+    this._mTableSize = big5freq.BIG5_TABLE_SIZE;
+    this._mTypicalDistributionRatio = big5freq.BIG5_TYPICAL_DISTRIBUTION_RATIO;
+
+    /**
+     * Order of a two-byte Big5 character.
+     * Range of interest: first byte 0xa4 -- 0xfe,
+     * second byte 0x40 -- 0x7e or 0xa1 -- 0xfe.
+     * Only the lower bound of the first byte is checked here; per the
+     * original note, the state machine has already validated the bytes.
+     *
+     * @return 157 * (first - 0xA4) + column, or -1 when the first byte is
+     *         below 0xA4
+     */
+    this.getOrder = function(aStr) {
+        var lead = aStr.charCodeAt(0);
+        if( !(lead >= 0xA4) ) {
+            return -1;
+        }
+        var trail = aStr.charCodeAt(1);
+        // the high second-byte range is placed 63 slots in, right after
+        // the slots used by the range that starts at 0x40
+        var column = ( trail >= 0xA1 ) ? trail - 0xA1 + 63 : trail - 0x40;
+        return 157 * (lead - 0xA4) + column;
+    };
+}
+Big5DistributionAnalysis.prototype = new CharDistributionAnalysis();
+
+exports.Big5DistributionAnalysis = Big5DistributionAnalysis
+
+/**
+ * Distribution analyser for Shift_JIS: a CharDistributionAnalysis wired
+ * to the jisfreq table with a Shift_JIS specific getOrder().
+ */
+function SJISDistributionAnalysis() {
+    CharDistributionAnalysis.apply(this);
+
+    var self = this;
+
+    function init() {
+        self._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
+        self._mTableSize = jisfreq.JIS_TABLE_SIZE;
+        self._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;
+    }
+
+    /**
+     * Order of a two-byte Shift_JIS character.
+     * Range of interest:
+     *   first  byte: 0x81 -- 0x9f , 0xe0 -- 0xef
+     *   second byte: 0x40 -- 0x7e , 0x80 -- 0xfc
+     *
+     * Each first byte owns 188 consecutive orders: 63 for second bytes
+     * 0x40 -- 0x7e followed by 125 for 0x80 -- 0xfc. 0x7f is not a valid
+     * second byte, so it must not consume a slot: second bytes above it
+     * are shifted down by one. Without that shift 0xfc would land on the
+     * first slot of the next first byte, and every order in the upper
+     * range would be one too high for the JIS frequency table. With it,
+     * the result equals the 94 * row + column order that
+     * EUCJPDistributionAnalysis computes for the same table.
+     *
+     * @param aStr string holding the two bytes as char codes
+     * @return the order, or -1 when either byte is outside the ranges
+     *         above (including a missing second byte)
+     */
+    this.getOrder = function(aStr) {
+        var lead = aStr.charCodeAt(0);
+        var trail = aStr.charCodeAt(1);
+        var row;
+
+        if( lead >= 0x81 && lead <= 0x9F ) {
+            row = lead - 0x81;
+        } else if( lead >= 0xE0 && lead <= 0xEF ) {
+            // the second first-byte range continues after the 31 rows of the first
+            row = lead - 0xE0 + 31;
+        } else {
+            return -1;
+        }
+
+        // written as a negated range test so that NaN (no second byte) is rejected too
+        if( !(trail >= 0x40 && trail <= 0xFC) || trail === 0x7F ) {
+            return -1;
+        }
+
+        var column = trail - 0x40;
+        if( trail > 0x7F ) {
+            // close the gap left by the unused 0x7f
+            column--;
+        }
+        return 188 * row + column;
+    }
+
+    init();
+}
+SJISDistributionAnalysis.prototype = new CharDistributionAnalysis();
+
+exports.SJISDistributionAnalysis = SJISDistributionAnalysis
+
+/**
+ * Distribution analyser for EUC-JP: a CharDistributionAnalysis wired to
+ * the jisfreq table with an EUC-JP specific getOrder().
+ */
+function EUCJPDistributionAnalysis() {
+    CharDistributionAnalysis.apply(this);
+
+    // frequency data for this encoding
+    this._mCharToFreqOrder = jisfreq.JISCharToFreqOrder;
+    this._mTableSize = jisfreq.JIS_TABLE_SIZE;
+    this._mTypicalDistributionRatio = jisfreq.JIS_TYPICAL_DISTRIBUTION_RATIO;
+
+    /**
+     * Order of a two-byte EUC-JP character.
+     * Range of interest: first byte 0xa0 -- 0xfe, second byte 0xa1 -- 0xfe.
+     * Only the lower bound of the first byte is checked here; per the
+     * original note, the state machine has already validated the bytes.
+     *
+     * @return 94 * (first - 0xA1) + (second - 0xA1), or -1 when the first
+     *         byte is below 0xA0 (note the guard uses 0xA0 while the
+     *         formula subtracts 0xA1)
+     */
+    this.getOrder = function(aStr) {
+        var lead = aStr.charCodeAt(0);
+        return ( lead >= 0xA0 )
+            ? 94 * (lead - 0xA1) + aStr.charCodeAt(1) - 0xA1
+            : -1;
+    };
+}
+EUCJPDistributionAnalysis.prototype = new CharDistributionAnalysis();
+
+exports.EUCJPDistributionAnalysis = EUCJPDistributionAnalysis