Main Page | Class Hierarchy | Class List | Directories | File List | Class Members | File Members

TranslationWordList.php

Go to the documentation of this file.
00001 <?php
00002 /***************************************************************************
00003  *   Copyright (C) 2005 by Ferenc Veres   *
00004  *   lion@netngine.hu   *
00005  *                                                                         *
00006  *   This program is free software; you can redistribute it and/or modify  *
00007  *   it under the terms of the GNU General Public License as published by  *
00008  *   the Free Software Foundation; either version 2 of the License, or     *
00009  *   (at your option) any later version.                                   *
00010  *                                                                         *
00011  *   This program is distributed in the hope that it will be useful,       *
00012  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
00013  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
00014  *   GNU General Public License for more details.                          *
00015  *                                                                         *
00016  *   You should have received a copy of the GNU General Public License     *
00017  *   along with this program; if not, write to the                         *
00018  *   Free Software Foundation, Inc.,                                       *
00019  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
00020  ***************************************************************************/
00021 include_once('TranslationWord.php');
00022 
/**
 * The distinct words of one sentence, with per-word occurrence counts and
 * (once known) their IDs in the WordIndex table.
 *
 * The list can be filled by parsing a text (constructor), or word by word
 * through AddWord() / AddID(). It implements Iterator, so it can be walked
 * with foreach; each element is a TranslationWord.
 *
 * The code sticks to PHP 5 syntax (array(), no type declarations) so it keeps
 * running on the PHP versions the project was written for.
 */
class TranslationWordList implements Iterator
{
    /** @var array TranslationWord objects, one per distinct word, appended in order (keys 0..n-1). */
    private $allWords = array();

    /** @var int When non-zero, SumCount() reports this value instead of the real sum. */
    private $fakeLength = 0;

    /** @var object Database connection as returned by GetDbConn(). */
    private $DB;

    /** @var array Table-name map as returned by GetTables(). */
    private $Tables;

    /** @var int Current position of the Iterator cursor. */
    private $currIndex = 0;

    /**
     * @var array|null Process-wide cache mapping word => WordIndex ID.
     *                 Filled by InitWordsCache() and extended by AddToWordIndex().
     */
    public static $WordsCache;

    /**
     * Creates the list, optionally parsing a text into words right away.
     *
     * @param string $text Text to split into words. When non-empty the words
     *                     are also looked up in / added to the word index.
     */
    public function __construct($text = "")
    {
        // Init DB references
        $this->DB = GetDbConn();
        $this->Tables = GetTables();

        $this->currIndex = 0;
        $this->fakeLength = 0;
        // Keep the property an (empty) array instead of unsetting it, so that
        // count() and foreach on an empty list do not fail.
        $this->allWords = array();

        // If text passed, parse it and fill the words array
        // (also adds to WordIndex if necessary)
        if ($text != "") {
            $this->SplitToWords($text);
        }
    }

    /**** Iterator interface ****/

    /**
     * Moves the cursor back to the first word.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->currIndex = 0;
    }

    /**
     * Tells whether the cursor still points at a word. Same result as valid().
     *
     * @return bool
     */
    public function hasMore()
    {
        return $this->valid();
    }

    /**
     * Returns the cursor position.
     *
     * @return int
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->currIndex;
    }

    /**
     * Returns the word under the cursor.
     *
     * @return TranslationWord|null Null when the cursor is past the end.
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return isset($this->allWords[$this->currIndex])
            ? $this->allWords[$this->currIndex]
            : null;
    }

    /**
     * Advances the cursor by one word.
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $this->currIndex++;
    }

    /**
     * Tells whether the cursor points at an existing word.
     *
     * @return bool
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return $this->currIndex < count($this->allWords);
    }

    /* Static initialization methods */

    /**
     * Loads every (Word, ID) pair of the WordIndex table into self::$WordsCache.
     *
     * Terminates the script with die() when the query fails.
     */
    public static function InitWordsCache()
    {
        $Tables = GetTables();
        $DB = GetDbConn();

        // Fetch already indexed words and their ID
        $rs = $DB->Execute("SELECT Word, ID FROM ".$Tables["WordIndex"]);

        if (!$rs) {
            // Static context: there is no $this here, use the local connection.
            die("DB Error:". $DB->ErrorMsg());
        }

        while ($oldWord = $rs->FetchRow()) {
            // Already existing words get an ID now
            self::$WordsCache[$oldWord["Word"]] = $oldWord["ID"];
        }
    }

    /**** Add words and properties ****/

    /**
     * Registers occurrences of a word identified by its text.
     *
     * A word not yet in the list is appended (without ID); otherwise its
     * occurrence count is increased.
     *
     * @param string $word  The word.
     * @param int    $count Number of occurrences to add.
     */
    public function AddWord($word, $count = 1)
    {
        $existing = $this->FindByName($word);
        if ($existing === null) {
            // First occurrence(s) in this sentence, create with empty ID
            $created = new TranslationWord();
            $created->SetWord($word);
            $created->SetCount($count);
            $this->allWords[] = $created;
            return;
        }

        // Not the first occurrence(s), count it
        $existing->SetCount($existing->GetCount() + $count);
    }

    /**
     * Registers occurrences of a word identified by its ID.
     *
     * A word not yet in the list is appended (without text); otherwise its
     * occurrence count is increased.
     *
     * @param mixed $id    The word's ID.
     * @param int   $count Number of occurrences to add.
     */
    public function AddID($id, $count = 1)
    {
        $existing = $this->FindByID($id);
        if ($existing === null) {
            // First occurrence(s) in this sentence, create with empty name
            $created = new TranslationWord();
            $created->SetID($id);
            $created->SetCount($count);
            $this->allWords[] = $created;
            return;
        }

        // Not the first occurrence(s), count it
        $existing->SetCount($existing->GetCount() + $count);
    }

    /**
     * Sets the ID of a word that is already in the list.
     *
     * Terminates the script with die() when the word is not in the list.
     *
     * @param string $word Text of the word to update.
     * @param mixed  $id   ID to store on it.
     */
    public function SetIDByName($word, $id)
    {
        $target = $this->FindByName($word);
        if ($target === null) {
            die("WORDS: SetIDByName to a not existing word: $word, $id.\n");
        }

        $target->SetID($id);
    }

    /**
     * Makes SumCount() report a fixed length instead of the real word sum.
     *
     * @param int $length Length to report; 0 switches back to the real sum.
     */
    public function SetFakeLength($length)
    {
        $this->fakeLength = $length;
    }

    /**
     * Replaces the list content with the words found in a text.
     *
     * Line breaks and a few HTML emphasis tags are removed, the text is split
     * on spaces and punctuation, leftover symbols are trimmed from each piece,
     * empty and numeric pieces are dropped and the rest is added lower-cased.
     * If any word was found, the words are then synchronised with the word
     * index (AddToWordIndex()).
     *
     * @param string $text Text to parse.
     */
    private function SplitToWords($text)
    {
        // Clear previous items
        $this->allWords = array();
        $this->SetFakeLength(0);

        // Line breaks and &nbsp; become spaces
        $text = str_replace(array("\n", "\r", "&nbsp;"), " ", $text);

        // Remove some HTML emphasis from the words
        $text = str_replace(
            array("<strong>", "</strong>", "<b>", "</b>", "<i>", "</i>", "<em>", "</em>"),
            "",
            $text
        );

        // Split to words. split() no longer exists since PHP 7, so use PCRE.
        // The backslash stays in the separator set because the former POSIX
        // bracket expression treated it as a literal character.
        $wordList = preg_split('/[ \\\\.,()!?:;=|*]/', $text);

        // Filter non-words
        foreach ($wordList as $word) {
            // Remove trash remained with split
            $word = trim($word, " \t\n\r:[]#'\"\$+-_<>/");

            // Nonempty, nonnumbers
            if ($word !== "" && !is_numeric($word)) {
                $this->AddWord(strtolower($word));
            }
        }

        // Add to word index database and get indexes for already indexed
        if ($this->SumCount() != 0) {
            $this->AddToWordIndex();
        }
    }

    /**** Get words and properties ****/

    /**
     * Finds a word of this list by its text.
     *
     * Texts are compared as strings, so numeric-looking words such as "1e3"
     * and "1000" are not treated as equal.
     *
     * @param string $word Text to look for.
     * @return TranslationWord|null The first matching word, or null.
     */
    public function &FindByName($word)
    {
        // A function returning by reference must return a variable.
        $found = null;
        $wanted = (string) $word;

        foreach ($this->allWords as $candidate) {
            if ((string) $candidate->GetWord() === $wanted) {
                $found = $candidate;
                break;
            }
        }

        return $found;
    }

    /**
     * Finds a word of this list by its ID.
     *
     * IDs are compared loosely (==) so that the integer 5 and the string "5"
     * denote the same word.
     *
     * @param mixed $id ID to look for.
     * @return TranslationWord|null The first matching word, or null.
     */
    public function &FindByID($id)
    {
        // A function returning by reference must return a variable.
        $found = null;

        foreach ($this->allWords as $candidate) {
            if ($candidate->GetID() == $id) {
                $found = $candidate;
                break;
            }
        }

        return $found;
    }

    /**
     * Returns the length of the sentence in words.
     *
     * @return int The fake length if one is set (see SetFakeLength()),
     *             otherwise the sum of all occurrence counts (0 when empty).
     */
    public function SumCount()
    {
        // Some sentences are not completely loaded but can fake their
        // original length
        if ($this->fakeLength != 0) {
            return $this->fakeLength;
        }

        // Summarize all word occurrences; an empty list sums to 0
        $total = 0;
        foreach ($this->allWords as $w) {
            $total += $w->GetCount();
        }

        return $total;
    }

    /**** Compare ****/

    /**
     * Computes how similar this sentence is to another one, matching words by ID.
     *
     * Every occurrence shared by both sentences scores 2 points; the maximum
     * is the sum of both sentence lengths, so two sentences with identical
     * words and counts score 100. (A fake length smaller than the loaded
     * words would distort the result.)
     *
     * @param TranslationWordList $otherWords The sentence to compare with.
     * @return float Match percentage rounded down; 0 when both sentences are empty.
     */
    public function CompareByID($otherWords)
    {
        // Max points is the sum of all words from both sentences
        $max = $this->SumCount() + $otherWords->SumCount();
        if ($max <= 0) {
            // Nothing to compare, and nothing to divide by.
            return 0.0;
        }

        $points = 0;

        // Loop on all our own words
        foreach ($this->allWords as $w) {
            // This word exists in the other sentence too?
            $w2 = $otherWords->FindByID($w->GetID());
            if ($w2 !== null) {
                // Common word: only the occurrences present in BOTH sentences
                // count, each of them for 2 points.
                $points += min($w->GetCount(), $w2->GetCount()) * 2;
            }
        }

        // Multiply before dividing: 100 * $points is exact, so a full match
        // yields exactly 100 instead of a value a hair below it.
        return floor((100 * $points) / $max);
    }

    /**** Database stuff ****/

    /**
     * Gives every word of the list its WordIndex ID.
     *
     * Words found in self::$WordsCache take the cached ID; all others are
     * inserted into the WordIndex table and the new ID is stored on the word
     * and in the cache. The cache is NOT loaded here - presumably callers run
     * InitWordsCache() beforehand (TODO confirm); without it every word is
     * inserted as new.
     *
     * Terminates the script with die() when the insert fails or no insert ID
     * is available.
     */
    private function AddToWordIndex()
    {
        // Objects are handles, so iterating by value still updates the words
        // stored in the list and leaves no dangling reference behind.
        foreach ($this->allWords as $w) {
            $word = $w->GetWord();

            // Already indexed word?
            if (!empty(self::$WordsCache[$word])) {
                $w->SetID(self::$WordsCache[$word]);
                continue;
            }

            // Add new word to the index and get its ID.
            // NOTE(review): the SQL is assembled by string concatenation and
            // relies on the project's fixstr() for escaping - consider bound
            // parameters if the DB layer supports them.
            $ok = $this->DB->Execute("INSERT INTO ".$this->Tables["WordIndex"]." (Word) VALUES ('".fixstr($word)."')");

            if (!$ok) {
                die("DB Error:". $this->DB->ErrorMsg()." Line: ".__LINE__);
            }

            $newId = $this->DB->Insert_ID();
            if (!$newId) {
                die("DB Error: Insert_ID not supported. Line: ".__LINE__);
            }

            // Store the new ID
            $w->SetID($newId);
            self::$WordsCache[$word] = $w->GetID();
        }

        // By now all Words have a name, ID and count for this Sentence
    }

}
00487 
00488 
00489 ?>

Generated on Sat Apr 22 16:49:54 2006 for XarayaTranslationMemory by  doxygen 1.4.4