-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathphptextcat.class.php
More file actions
97 lines (84 loc) · 2.94 KB
/
phptextcat.class.php
File metadata and controls
97 lines (84 loc) · 2.94 KB
1
<?php
/**
 * phptextcat.class.php Textcat main class
 *
 * PHP TextCat is a PHP implementation of the text categorization (TextCat) algorithm
 * presented in Cavnar, W. B. and J. M. Trenkle, `N-Gram-Based Text Categorization'.
 * Used here as a language guesser.
 *
 * Please, contribute !
 *
 * @author Christophe Dri
 */
class PhpTextCat
{
    const DEFAULT_DIR_LANGUAGES = 'languages/';
    const DEFAULT_EXTENSION_LANGUAGES = '.lng';

    /**
     * Loaded language profiles: sanitized language name => list of n-gram strings.
     *
     * @var array
     */
    protected $_languages = array();

    /**
     * Suffix appended to the language name to build the profile file name.
     *
     * @var string
     */
    protected $_languages_extension = self::DEFAULT_EXTENSION_LANGUAGES;

    /**
     * Prefix prepended to the language name to build the profile file name.
     * It is concatenated verbatim, so a directory must end with a separator.
     *
     * @var string
     */
    protected $_languages_dir = self::DEFAULT_DIR_LANGUAGES;

    /**
     * Sets the file extension (including the leading dot) of language files.
     *
     * @param string $ext
     */
    public function setLanguageExt($ext)
    {
        $this->_languages_extension = $ext;
    }

    /**
     * Sets the directory prefix language files are loaded from.
     *
     * The value is concatenated as-is with the language name, so include
     * the trailing slash (e.g. 'languages/').
     *
     * @param string $dir
     */
    public function setLanguageDir($dir)
    {
        $this->_languages_dir = $dir;
    }

    /**
     * Optionally loads languages right away.
     *
     * Accepts either an array of language names, a single name, or several
     * names as separate arguments. Languages given here are loaded with the
     * default directory and extension, because the setters cannot be called
     * before construction; call loadLanguages() after the setters otherwise.
     *
     * @param mixed $languages
     */
    public function __construct($languages = null)
    {
        if (func_num_args() > 1) {
            $languages = func_get_args();
        }

        if (!empty($languages)) {
            $this->loadLanguages($languages);
        }
    }

    /**
     * Loads a set of languages using an array of strings or multiple arguments.
     *
     * @param mixed $languages
     */
    public function loadLanguages($languages)
    {
        if (func_num_args() > 1) {
            $languages = func_get_args();
        } elseif (!is_array($languages)) {
            $languages = array((string) $languages);
        }

        foreach ($languages as $lng) {
            $this->loadLanguage($lng);
        }
    }

    /**
     * Loads one language defined by the $lng string ( {$lng}.lng ).
     *
     * The name is reduced to [A-Za-z0-9_-] before it is used in a path.
     * Each non-empty line of the file is one n-gram; underscores in the file
     * stand for spaces. A missing or unreadable file is reported through
     * error_log() and leaves the set of loaded languages unchanged.
     *
     * @param string $lng
     */
    public function loadLanguage($lng)
    {
        // Sanitize language name so it cannot escape the languages directory.
        $lng = preg_replace('/[^a-z0-9_-]/i', '', $lng);
        $filename = $this->_languages_dir . $lng . $this->_languages_extension;

        // is_file() rather than file_exists(): a directory with a matching
        // name must not be handed to file().
        if (!is_file($filename)) {
            error_log("php-textcat language file [{$lng}] not found");
            return;
        }

        // Blank lines are skipped: an empty n-gram is meaningless and makes
        // substr_count() fail when the text is rated.
        $ngrams = file($filename, FILE_IGNORE_NEW_LINES | FILE_SKIP_EMPTY_LINES);
        if ($ngrams === false) {
            error_log("php-textcat language file [{$lng}] could not be read");
            return;
        }

        // Load language N-grams ('_' encodes a space in the profile files).
        $this->_languages[$lng] = str_replace('_', ' ', $ngrams);
    }

    /**
     * Gives a score for the text passed as an argument, depending on loaded languages.
     *
     * The score of a language is the total number of occurrences, in the
     * lower-cased text, of every n-gram of that language. The result is
     * sorted by descending score and is empty when no language is loaded.
     *
     * @param string $text
     * @return array Associative array : ( lng1 => score1, lng2 => score2, ... )
     */
    public function rateText($text)
    {
        // NOTE(review): strtolower() works on bytes, so non-ASCII letters may
        // not be lower-cased; confirm the encoding of the language files
        // before switching to a multibyte-aware function.
        $text = strtolower($text);
        $count = array();

        foreach ($this->_languages as $lng => $ngrams) {
            $count[$lng] = 0;
            foreach ($ngrams as $ngram) {
                // Guard against empty n-grams (e.g. injected by a subclass):
                // substr_count() rejects an empty needle.
                if ($ngram === '') {
                    continue;
                }
                $count[$lng] += substr_count($text, $ngram);
            }
        }

        arsort($count);

        return $count;
    }

    /**
     * Returns the best scoring language (en, fr, etc.) for the text passed as an argument.
     *
     * @param string $text
     * @return string|false Language name, or false when no language is loaded
     */
    public function guessLanguage($text)
    {
        $grades = array_keys($this->rateText($text));

        // reset() yields the first (highest ranked) key, or false on an empty list.
        return reset($grades);
    }
}