XarayaTranslationMemory  1.6
TranslationWordList.php
Go to the documentation of this file.
1 <?php
2 /***************************************************************************
3  * Copyright (C) 2005-2020 by Ferenc Veres *
4  * lion@netngine.hu *
5  * *
6  * This program is free software; you can redistribute it and/or modify *
7  * it under the terms of the GNU General Public License as published by *
8  * the Free Software Foundation; either version 3 of the License, or *
9  * (at your option) any later version. *
10  * *
11  * This program is distributed in the hope that it will be useful, *
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of *
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
14  * GNU General Public License for more details. *
15  * *
16  * You should have received a copy of the GNU General Public License *
17  * along with this program. If not, see <http://www.gnu.org/licenses/>. *
18  ***************************************************************************/
19 include_once('TranslationWord.php');
20 
/**
 * List of the distinct words of one sentence.
 *
 * Each entry is a TranslationWord carrying the word text, its ID in the
 * WordIndex table and the number of occurrences. The list can be built by
 * parsing a text (constructor argument) or filled manually with AddWord() /
 * AddID(). It implements Iterator, so it can be used directly in foreach.
 */
class TranslationWordList implements Iterator
{
    /** TranslationWord objects held by this list; always an array. */
    private $allWords = array();
    /** When non-zero, SumCount() reports this value instead of the real sum. */
    private $fakeLength;
    /** Database connection as returned by GetDbConn(). */
    private $DB;
    /** Table name map as returned by GetTables(). */
    private $Tables;
    /** Current position used by the Iterator methods. */
    private $currIndex;
    /** Static cache mapping word text to WordIndex ID; filled by InitWordsCache(). */
    public static $WordsCache;

    /**
     * Create a word list, optionally parsing a text right away.
     *
     * @param string $text Text to split into words. When not empty the words
     *                     are counted and registered in the word index.
     */
    public function __construct($text = "")
    {
        // Init DB references
        $this->DB = GetDbConn();
        $this->Tables = GetTables();

        $this->currIndex = 0;
        $this->fakeLength = 0;
        // Keep the property a real array: an unset property breaks count()
        // (TypeError on PHP 8) and foreach in the methods below.
        $this->allWords = array();

        // If text passed, parse it and fill the words array (also adds to WordIndex if necessary)
        if($text != "")
        {
            $this->SplitToWords($text);
        }
    }

    /**** Iterator interface ****/

    /**
     * Restart the iteration at the first word.
     */
    #[\ReturnTypeWillChange]
    public function rewind()
    {
        $this->currIndex = 0;
    }

    /**
     * Tell whether the current position still points at a word.
     *
     * @return bool Same result as valid().
     */
    public function hasMore()
    {
        return $this->valid();
    }

    /**
     * Current iteration position.
     *
     * @return int Zero based index of the current word.
     */
    #[\ReturnTypeWillChange]
    public function key()
    {
        return $this->currIndex;
    }

    /**
     * Word at the current iteration position.
     *
     * @return TranslationWord|null The current word, or null when the
     *                              position is past the end of the list.
     */
    #[\ReturnTypeWillChange]
    public function current()
    {
        return isset($this->allWords[$this->currIndex]) ? $this->allWords[$this->currIndex] : null;
    }

    /**
     * Advance the iteration to the next word.
     */
    #[\ReturnTypeWillChange]
    public function next()
    {
        $this->currIndex++;
    }

    /**
     * Tell whether the current position is inside the list.
     *
     * @return bool True while there is a word at the current position.
     */
    #[\ReturnTypeWillChange]
    public function valid()
    {
        return count($this->allWords) > $this->currIndex;
    }

    /* Static initialization methods */

    /**
     * Load every Word/ID pair of the WordIndex table into the static
     * $WordsCache. Terminates the script with die() if the query fails.
     */
    public static function InitWordsCache()
    {
        $Tables = GetTables();
        $DB = GetDbConn();

        // Fetch already indexed words and their ID
        $rs = $DB->Execute("SELECT Word, ID FROM ".$Tables["WordIndex"]);

        if(!$rs)
        {
            // Static context: there is no $this here, report through the local connection.
            die("DB Error:". $DB->ErrorMsg());
        }

        while($oldWord = $rs->FetchRow())
        {
            // Already existing words get an ID now
            self::$WordsCache[$oldWord["Word"]] = $oldWord["ID"];
        }
    }

    /**** Add words and properties ****/

    /**
     * Add occurrences of a word identified by its text.
     *
     * A new TranslationWord (without ID) is appended when the word is not in
     * the list yet; otherwise the count of the existing entry is increased.
     *
     * @param string $word  Word text.
     * @param int    $count Number of occurrences to add.
     */
    public function AddWord($word, $count = 1)
    {
        $w = $this->FindByName($word);
        if($w === null)
        {
            // First occurrence(s) in this sentence, create with empty ID
            $w = new TranslationWord();
            $w->SetWord($word);
            $w->SetCount($count);
            $this->allWords[] = $w;
        }
        else
        {
            // Not first occurrence(s), count it
            $w->SetCount($w->GetCount() + $count);
        }
    }

    /**
     * Add occurrences of a word identified by its ID.
     *
     * A new TranslationWord (without text) is appended when the ID is not in
     * the list yet; otherwise the count of the existing entry is increased.
     *
     * @param mixed $id    Word ID.
     * @param int   $count Number of occurrences to add.
     */
    public function AddID($id, $count = 1)
    {
        $w = $this->FindByID($id);
        if($w === null)
        {
            // First occurrence(s) in this sentence, create with empty name
            $w = new TranslationWord();
            $w->SetID($id);
            $w->SetCount($count);
            $this->allWords[] = $w;
        }
        else
        {
            // Not first occurrence(s), count it
            $w->SetCount($w->GetCount() + $count);
        }
    }

    /**
     * Set the ID of a word that is already in the list.
     *
     * Terminates the script with die() when the word is not in the list.
     *
     * @param string $word Word text to look up.
     * @param mixed  $id   ID to store on that word.
     */
    public function SetIDByName($word, $id)
    {
        $w = $this->FindByName($word);
        if($w === null)
        {
            die("WORDS: SetIDByName to a not existing word: $word, $id.\n");
        }

        $w->SetID($id);
    }

    /**
     * Override the length reported by SumCount().
     *
     * @param int $length Length to report; 0 restores the real sum.
     */
    public function SetFakeLength($length)
    {
        $this->fakeLength = $length;
    }

    /**
     * Replace the list content with the words found in a text.
     *
     * Line breaks, a few HTML emphasis tags and &nbsp; are removed, the text
     * is split on punctuation, leftovers are trimmed, and every non-empty,
     * non-numeric token is added in lower case. If any word remains, the
     * words are registered in the word index.
     *
     * @param string $text Text to parse.
     */
    private function SplitToWords($text)
    {
        // Clear previous items (keep the property an array, see constructor)
        $this->allWords = array();
        $this->SetFakeLength(0);

        // Remove linebreaks
        $text = str_replace(array("\n", "\r"), " ", $text);

        // Remove some HTML emphasis from the words
        $text = str_replace(
            array("<strong>", "</strong>", "<b>", "</b>", "<i>", "</i>", "<em>", "</em>"),
            "",
            $text
        );
        $text = str_replace("&nbsp;", " ", $text);

        // Split to words
        $wordList = preg_split('/[- \.,()!\?\/\\<>_:;=|\\*]/', $text, -1, PREG_SPLIT_NO_EMPTY);

        // Filter non-words
        foreach($wordList as $word)
        {
            // Remove trash remained with split
            $word = trim($word, " \t\n\r:[]#'\"$+-_<>/");

            // Nonempty, nonnumbers
            if($word != "" && !is_numeric($word))
            {
                $this->AddWord(strtolower($word));
            }
        }

        // Add to word index database and get indexes for already indexed
        if($this->SumCount() != 0)
        {
            $this->AddToWordIndex();
        }
    }

    /**** Get words and properties ****/

    /**
     * Find a word of this list by its text.
     *
     * @param string $word Word text to look for.
     * @return TranslationWord|null First entry whose GetWord() equals $word
     *                              (loose comparison), or null if none.
     */
    public function &FindByName($word)
    {
        // Returned by reference for compatibility with callers using "=&";
        // the result therefore always has to be a variable.
        $found = null;

        foreach($this->allWords as $w)
        {
            if($w->GetWord() == $word)
            {
                $found = $w;
                break;
            }
        }

        return $found;
    }

    /**
     * Find a word of this list by its ID.
     *
     * @param mixed $id Word ID to look for.
     * @return TranslationWord|null First entry whose GetID() equals $id
     *                              (loose comparison), or null if none.
     */
    public function &FindByID($id)
    {
        // Returned by reference for compatibility with callers using "=&";
        // the result therefore always has to be a variable.
        $found = null;

        foreach($this->allWords as $w)
        {
            if($w->GetID() == $id)
            {
                $found = $w;
                break;
            }
        }

        return $found;
    }

    /**
     * Length of the sentence in words.
     *
     * @return int The fake length when one was set with SetFakeLength(),
     *             otherwise the sum of all word counts (0 for an empty list).
     */
    public function SumCount()
    {
        // Some sentences are not completely loaded but can fake their
        // original length
        if($this->fakeLength <> 0)
        {
            return $this->fakeLength;
        }

        // Summarize all word occurrences; an empty list sums to 0
        $count = 0;
        foreach($this->allWords as $w)
        {
            $count += $w->GetCount();
        }

        return $count;
    }

    /**** Compare ****/

    /**
     * Compare this list with another one by word IDs.
     *
     * The maximum score is the sum of both lists' SumCount(). Every
     * occurrence of a word that is present in both lists earns 2 points
     * (one for each side), i.e. min(own count, other count) * 2 per word.
     *
     * @param TranslationWordList $otherWords List to compare with.
     * @return float Match percentage rounded down (0 when both lists are empty).
     */
    public function CompareByID($otherWords)
    {
        // Max points is the sum of all words from both sentences
        $max = $this->SumCount() + $otherWords->SumCount();

        // Two empty sentences have nothing in common; also avoids dividing by zero
        if($max == 0)
        {
            return 0.0;
        }

        $points = 0;

        // Loop on all our own words
        foreach($this->allWords as $w)
        {
            // This word exists in the other sentence too?
            $w2 = $otherWords->FindByID($w->GetID());
            if($w2 !== null)
            {
                // Common word: only the occurrences present on both sides count
                $points += min($w->GetCount(), $w2->GetCount()) * 2;
            }
        }

        // Return match percentage
        return floor((100/$max) * $points);
    }

    /**** Database stuff ****/

    /**
     * Give every word of this list its WordIndex ID.
     *
     * Words found in the static $WordsCache take their ID from there. All
     * other words are inserted into the WordIndex table and the new ID is
     * stored on the word and in the cache. Terminates the script with die()
     * when the insert fails or no insert ID is available.
     *
     * NOTE(review): the cache is not loaded here; presumably the caller is
     * expected to run InitWordsCache() first - confirm.
     */
    private function AddToWordIndex()
    {
        foreach($this->allWords as $w)
        {
            $word = $w->GetWord();

            // Already indexed word?
            if(!empty(self::$WordsCache[$word]))
            {
                $w->SetID(self::$WordsCache[$word]);
                continue;
            }

            // Add new word to the index and get its ID
            $ok = $this->DB->Execute("INSERT INTO ".$this->Tables["WordIndex"]." (Word) VALUES ('".fixstr($word)."')");

            if(!$ok)
            {
                die("DB Error:". $this->DB->ErrorMsg()." Line: ".__LINE__);
            }

            $newID = $this->DB->Insert_ID();
            if(!$newID)
            {
                die("DB Error: Insert_ID not supported. Line: ".__LINE__);
            }

            // Store the new ID
            $w->SetID($newID);
            self::$WordsCache[$word] = $w->GetID();
        }

        // By now all words have a name, ID and count for this sentence
    }

}
485 
486 
487 ?>
fixstr($text)
Definition: Common.php:36
& GetTables()
Definition: Common.php:71
& GetDbConn()
Definition: Common.php:61
AddWord($word, $count=1)