/* * The Original Code is Mozilla Universal charset detector code. * * The Initial Developer of the Original Code is * Netscape Communications Corporation. * Portions created by the Initial Developer are Copyright (C) 2001 * the Initial Developer. All Rights Reserved. * * Contributor(s): * António Afonso (antonio.afonso gmail.com) - port to JavaScript * Mark Pilgrim - port to Python * Shy Shalom - original C code * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public * License as published by the Free Software Foundation; either * version 2.1 of the License, or (at your option) any later version. * * This library is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * Lesser General Public License for more details. * * You should have received a copy of the GNU Lesser General Public * License along with this library; if not, write to the Free Software * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA * 02110-1301 USA */ var CodingStateMachine = require('./codingstatemachine'); var CharSetProber = require('./charsetprober'); var constants = require('./constants'); var UTF8SMModel = require('./mbcssm/utf8'); function UTF8Prober() { CharSetProber.apply(this); var ONE_CHAR_PROB = 0.5; var self = this; function init() { self._mCodingSM = new CodingStateMachine(UTF8SMModel); self.reset(); } this.reset = function() { UTF8Prober.prototype.reset.apply(this); this._mCodingSM.reset(); this._mNumOfMBChar = 0; } this.getCharsetName = function() { return "UTF-8"; } this.feed = function(aBuf) { for( var i = 0, c; i < aBuf.length; i++ ) { c = aBuf[i]; var codingState = this._mCodingSM.nextState(c); if( codingState == constants.error ) { this._mState = constants.notMe; break; } else if( codingState == constants.itsMe ) { this._mState = constants.foundIt; break; } else if( codingState == constants.start ) { if( this._mCodingSM.getCurrentCharLen() >= 2 ) { this._mNumOfMBChar++; } } } if( this.getState() == constants.detecting ) { if( this.getConfidence() > constants.SHORTCUT_THRESHOLD ) { this._mState = constants.foundIt; } } return this.getState(); } this.getConfidence = function() { var unlike = 0.99; if( this._mNumOfMBChar < 6 ) { for( var i = 0; i < this._mNumOfMBChar; i++ ) { unlike *= ONE_CHAR_PROB; } return 1 - unlike; } else { return unlike; } } init(); } UTF8Prober.prototype = new CharSetProber(); module.exports = UTF8Prober;