Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
stopper.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: stopper.c
3  ** Purpose: Stopping criteria for word classifier.
4  ** Author: Dan Johnson
5  ** History: Mon Apr 29 14:56:49 1991, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 #include "stopper.h"
20 #include "matchdefs.h"
21 #include "callcpp.h"
22 #include "permute.h"
23 #include "danerror.h"
24 #include "const.h"
25 #include "efio.h"
26 #include "scanutils.h"
27 #include "unichar.h"
28 #include "params.h"
29 #include "dict.h"
30 #include "image.h"
31 #include "ccutil.h"
32 #include "ratngs.h"
33 #include "ambigs.h"
34 
35 #include <stdio.h>
36 #include <string.h>
37 #include <ctype.h>
38 #include <math.h>
39 #ifdef __UNIX__
40 #include <assert.h>
41 #endif
42 
43 #ifdef _MSC_VER
44 #pragma warning(disable:4244) // Conversion warnings
45 #pragma warning(disable:4800) // int/bool warnings
46 #endif
47 
48 /* these are kludges - add appropriate .h file later */
49 /* from adaptmatch.cpp */
50 #define MAX_WERD_SIZE 100
51 
52 typedef struct
53 {
55  float ChunkCertainty[MAX_NUM_CHUNKS];
58 
59 void DeleteViableChoiceStruct(void *vcs) {
60  delete (static_cast<VIABLE_CHOICE_STRUCT *>(vcs));
61 }
62 
63 #define BestCertainty(Choices) \
64  (((VIABLE_CHOICE) first_node (Choices))->Certainty)
65 
66 #define BestRating(Choices) (((VIABLE_CHOICE) first_node (Choices))->Rating)
67 
68 #define BestFactor(Choices) \
69  (((VIABLE_CHOICE) first_node (Choices))->AdjustFactor)
70 
74 // Returns -1 if the rating for Choice1 is less than the rating for Choice2,
75 // otherwise returns 1.
76 static int CmpChoiceRatings(void *arg1, // VIABLE_CHOICE Choice1
77  void *arg2) { // VIABLE_CHOICE Choice2
78  float R1, R2;
79  VIABLE_CHOICE Choice1 = (VIABLE_CHOICE) arg1;
80  VIABLE_CHOICE Choice2 = (VIABLE_CHOICE) arg2;
81  R1 = Choice1->Rating;
82  R2 = Choice2->Rating;
83  return (R1 < R2) ? -1 : 1;
84 }
85 
86 // Expands Choice and places the results in ExpandedChoice. The primary
87 // function of expansion is to create an two arrays, one which holds the
88 // corresponding certainty for each chunk in Choice, and one which holds
89 // the class for each chunk.
90 static void ExpandChoice(VIABLE_CHOICE Choice,
91  EXPANDED_CHOICE *ExpandedChoice) {
92  int i, j, Chunk;
93  ExpandedChoice->Choice = Choice;
94  for (i = 0, Chunk = 0; i < Choice->Length; i++)
95  for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
96  ExpandedChoice->ChunkCertainty[Chunk] = Choice->Blob[i].Certainty;
97  ExpandedChoice->ChunkClass[Chunk] = Choice->Blob[i].Class;
98  }
99 }
100 
102  : Length(length) {
103  Blob = new CHAR_CHOICE[length];
104  blob_choices = NULL;
105 }
106 
108  Blob = NULL;
109  blob_choices = NULL;
110 }
111 
113  delete []Blob;
114  if (blob_choices) {
115  blob_choices->deep_clear();
116  delete blob_choices;
117  }
118 }
119 
121  const WERD_CHOICE &word_choice,
122  const PIECES_STATE &pieces_state,
123  const float certainties[],
124  FLOAT32 adjust_factor) {
125  this->Rating = word_choice.rating();
126  this->Certainty = word_choice.certainty();
127  this->AdjustFactor = adjust_factor;
128  this->ComposedFromCharFragments = false;
129  ASSERT_HOST(this->Length == word_choice.length());
130 
131  for (int i = 0, bw_idx = 0; i < word_choice.length(); i++, bw_idx++) {
132  int blob_width = pieces_state[bw_idx];
133  CHAR_CHOICE *blob_choice = &this->Blob[i];
134  blob_choice->Class = word_choice.unichar_id(i);
135  blob_choice->NumChunks = blob_width;
136  blob_choice->Certainty = certainties[i];
137  for (int f = 1; f < word_choice.fragment_length(i); ++f) {
138  blob_width = pieces_state[++bw_idx];
139  assert(blob_width > 0);
140  blob_choice->NumChunks += blob_width;
141  this->ComposedFromCharFragments = true;
142  }
143  }
144 }
145 
147  const BLOB_CHOICE_LIST_VECTOR &src_choices) {
148  if (blob_choices != NULL) {
149  blob_choices->deep_clear();
150  } else {
151  blob_choices = new BLOB_CHOICE_LIST_CLIST();
152  }
153  BLOB_CHOICE_LIST_C_IT list_it(blob_choices);
154 
155  for (int i = 0; i < src_choices.size(); ++i) {
156  BLOB_CHOICE_LIST *cc_list = new BLOB_CHOICE_LIST();
157  cc_list->deep_copy(src_choices[i], &BLOB_CHOICE::deep_copy);
158  list_it.add_after_then_move(cc_list);
159  }
160 }
161 
162 namespace tesseract {
163 
164 // If the certainty of any chunk in Choice (item1) is not ambiguous with the
165 // corresponding chunk in the best choice (item2), frees Choice and
166 // returns true.
168  void *item1, // VIABLE_CHOICE Choice,
169  void *item2) { // EXPANDED_CHOICE *BestChoice
170  int i, j, Chunk;
171  FLOAT32 Threshold;
172  VIABLE_CHOICE Choice = reinterpret_cast<VIABLE_CHOICE>(item1);
173  EXPANDED_CHOICE *BestChoice = reinterpret_cast<EXPANDED_CHOICE *>(item2);
174  Threshold = StopperAmbigThreshold(BestChoice->Choice->AdjustFactor,
175  Choice->AdjustFactor);
176  for (i = 0, Chunk = 0; i < Choice->Length; i++) {
177  for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
178  if (Choice->Blob[i].Class != BestChoice->ChunkClass[Chunk] &&
179  Choice->Blob[i].Certainty - BestChoice->ChunkCertainty[Chunk] <
180  Threshold) {
181  if (stopper_debug_level >= 2)
182  PrintViableChoice(stderr, "\nDiscarding bad choice: ", Choice);
183  delete Choice;
184  return true;
185  }
186  }
187  }
188  return false;
189 }
190 
192  WERD_CHOICE *BestChoice,
193  DANGERR *fixpt,
195  bool *modified_blobs) {
196  float CertaintyThreshold = stopper_nondict_certainty_base;
197  int WordSize;
198  if (modified_blobs != NULL) *modified_blobs = false;
199 
200  if (stopper_no_acceptable_choices) return false;
201 
202  if (fixpt != NULL) fixpt->clear();
203  if (BestChoice->length() == 0)
204  return false;
205  if (caller == CHOPPER_CALLER && BestChoice->fragment_mark()) {
206  if (stopper_debug_level >= 1) {
207  cprintf("AcceptableChoice(): a choice with fragments beats BestChoice");
208  }
209  return false;
210  }
211 
212  bool no_dang_ambigs = (GetMaxFixedLengthDawgIndex() >= 0 ||
213  NoDangerousAmbig(BestChoice, fixpt, true,
214  Choices, modified_blobs));
215  bool is_valid_word = valid_word_permuter(BestChoice->permuter(), false);
216  bool is_case_ok = case_ok(*BestChoice, getUnicharset());
217 
218  if (stopper_debug_level >= 1)
219  tprintf("\nStopper: %s (word=%c, case=%c)\n",
220  BestChoice->debug_string().string(),
221  (is_valid_word ? 'y' : 'n'),
222  (is_case_ok ? 'y' : 'n'));
223 
224  // Do not accept invalid words in PASS1.
225  if (reject_offset_ <= 0.0f && !is_valid_word) return false;
226  if (is_valid_word && is_case_ok) {
227  WordSize = LengthOfShortestAlphaRun(*BestChoice);
228  WordSize -= stopper_smallword_size;
229  if (WordSize < 0)
230  WordSize = 0;
231  CertaintyThreshold += WordSize * stopper_certainty_per_char;
232  }
233 
234  if (stopper_debug_level >= 1)
235  tprintf("Stopper: Certainty = %4.1f, Threshold = %4.1f\n",
236  BestChoice->certainty(), CertaintyThreshold);
237 
238  if (no_dang_ambigs &&
239  BestChoice->certainty() > CertaintyThreshold &&
240  UniformCertainties(*Choices, *BestChoice)) {
241  return true;
242  } else {
243  if (stopper_debug_level >= 2) {
244  tprintf("AcceptableChoice() returned false"
245  " (no_dang_ambig:%d cert:%g thresh:%g uniform:%d)\n",
246  no_dang_ambigs, BestChoice->certainty(),
247  CertaintyThreshold,
248  UniformCertainties(*Choices, *BestChoice));
249  }
250  return false;
251  }
252 }
253 
254 bool Dict::AcceptableResult(const WERD_CHOICE &BestChoice) {
255  float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_;
256  int WordSize;
257 
258  if (stopper_debug_level >= 1) {
259  tprintf("\nRejecter: %s (word=%c, case=%c, unambig=%c)\n",
260  BestChoice.debug_string().string(),
261  (valid_word(BestChoice) ? 'y' : 'n'),
262  (case_ok(BestChoice, getUnicharset()) ? 'y' : 'n'),
263  ((list_rest (best_choices_) != NIL_LIST) ? 'n' : 'y'));
264  }
265 
266  if (BestChoice.length() == 0 || CurrentWordAmbig())
267  return false;
268  if (BestChoice.fragment_mark()) {
269  if (stopper_debug_level >= 1) {
270  cprintf("AcceptableResult(): a choice with fragments beats BestChoice\n");
271  }
272  return false;
273  }
274  if (valid_word(BestChoice) && case_ok(BestChoice, getUnicharset())) {
275  WordSize = LengthOfShortestAlphaRun(BestChoice);
276  WordSize -= stopper_smallword_size;
277  if (WordSize < 0)
278  WordSize = 0;
279  CertaintyThreshold += WordSize * stopper_certainty_per_char;
280  }
281 
282  if (stopper_debug_level >= 1)
283  cprintf ("Rejecter: Certainty = %4.1f, Threshold = %4.1f ",
284  BestChoice.certainty(), CertaintyThreshold);
285 
286  if (BestChoice.certainty() > CertaintyThreshold &&
288  if (stopper_debug_level >= 1)
289  cprintf("ACCEPTED\n");
290  return true;
291  }
292  else {
293  if (stopper_debug_level >= 1)
294  cprintf("REJECTED\n");
295  return false;
296  }
297 }
298 
300  LIST Alternatives;
301  VIABLE_CHOICE Choice;
302  Alternatives = list_rest (best_choices_);
303  iterate(Alternatives) {
304  Choice = (VIABLE_CHOICE) first_node (Alternatives);
305  if (Choice->AdjustFactor <= Threshold)
306  return false;
307  }
308  return true;
309 }
310 
311 bool Dict::CurrentBestChoiceIs(const WERD_CHOICE &WordChoice) {
312  return (best_choices_ != NIL_LIST &&
313  StringSameAs(WordChoice, (VIABLE_CHOICE)first_node(best_choices_)));
314 }
315 
317  VIABLE_CHOICE BestChoice;
318  if (best_choices_ == NIL_LIST)
319  return (MAX_FLOAT32);
320  BestChoice = (VIABLE_CHOICE) first_node (best_choices_);
321  return (BestChoice->AdjustFactor);
322 }
323 
324 
326  return (list_rest (best_choices_) != NIL_LIST);
327 }
328 
329 
331  LIST Choices;
332  int i;
333  char LabelString[80];
334  VIABLE_CHOICE VChoice = (VIABLE_CHOICE)first_node(best_choices_);
335  bool force_debug =
336  fragments_debug && VChoice != NULL && VChoice->ComposedFromCharFragments;
337 
338  if (stopper_debug_level >= 1 || force_debug ||
339  (((STRING)word_to_debug).length() > 0 && best_choices_ &&
340  StringSameAs(word_to_debug.string(), word_to_debug_lengths.string(),
341  (VIABLE_CHOICE)first_node(best_choices_)))) {
342  if (best_raw_choice_)
343  PrintViableChoice(stderr, "\nBest Raw Choice: ", best_raw_choice_);
344 
345  i = 1;
346  Choices = best_choices_;
347  if (Choices)
348  cprintf("\nBest Cooked Choices:\n");
349  iterate(Choices) {
350  sprintf(LabelString, "Cooked Choice #%d: ", i);
351  PrintViableChoice(stderr, LabelString,
352  (VIABLE_CHOICE)first_node(Choices));
353  i++;
354  }
355  }
356 }
357 
358 void Dict::PrintAmbigAlternatives(FILE *file, const char *label,
359  int label_num_unichars) {
360  iterate(raw_choices_) {
361  VIABLE_CHOICE Choice = (VIABLE_CHOICE)first_node(raw_choices_);
362  if (Choice->Length > 0 &&
363  (label_num_unichars > 1 || Choice->Length > 1)) {
364  for (int i = 0; i < Choice->Length; i++) {
365  fprintf(file, "%s",
366  getUnicharset().id_to_unichar(Choice->Blob[i].Class));
367  }
368  fflush(file);
369  fprintf(file, "\t%s\t%.4f\t%.4f\n", label,
370  Choice->Rating, Choice->Certainty);
371  }
372  }
373 }
374 
376  EXPANDED_CHOICE BestChoice;
377 
378  if (best_choices_ == NIL_LIST || second_node (best_choices_) == NIL_LIST)
379  return;
380 
381  // Compute certainties and class for each chunk in best choice.
382  VIABLE_CHOICE_STRUCT *best_choice =
383  (VIABLE_CHOICE_STRUCT *)first_node(best_choices_);
384  ExpandChoice(best_choice, &BestChoice);
385  if (stopper_debug_level >= 2)
386  PrintViableChoice(stderr, "\nFiltering against best choice: ", best_choice);
389  set_rest(best_choices_, delete_d(list_rest(best_choices_),
390  &BestChoice, is_bad));
391  delete is_bad;
392 }
393 
395  FLOAT32 MaxRating,
396  FLOAT32 RatingMargin,
397  FLOAT32 Thresholds[]) {
398  EXPANDED_CHOICE BestRaw;
399  VIABLE_CHOICE Choice;
400  int i, j, Chunk;
401  FLOAT32 AvgRating;
402  int NumErrorChunks;
403 
404  assert (best_choices_ != NIL_LIST);
405  assert (best_raw_choice_ != NULL);
406 
407  ExpandChoice(best_raw_choice_, &BestRaw);
408  Choice = (VIABLE_CHOICE) first_node (best_choices_);
409 
410  for (i = 0, Chunk = 0; i < Choice->Length; i++, Thresholds++) {
411  AvgRating = 0.0;
412  NumErrorChunks = 0;
413 
414  for (j = 0; j < Choice->Blob[i].NumChunks; j++, Chunk++) {
415  if (Choice->Blob[i].Class != BestRaw.ChunkClass[Chunk]) {
416  AvgRating += BestRaw.ChunkCertainty[Chunk];
417  NumErrorChunks++;
418  }
419  }
420 
421  if (NumErrorChunks > 0) {
422  AvgRating /= NumErrorChunks;
423  *Thresholds = (AvgRating / -certainty_scale) * (1.0 - RatingMargin);
424  }
425  else
426  *Thresholds = MaxRating;
427 
428  if (*Thresholds > MaxRating)
429  *Thresholds = MaxRating;
430  if (*Thresholds < MinRating)
431  *Thresholds = MinRating;
432  }
433 }
434 
436  BLOB_WIDTH *BlobWidth, *End;
437 
438  if (best_raw_choice_)
439  delete best_raw_choice_;
440  best_raw_choice_ = NULL;
441 
442  if (best_choices_)
443  destroy_nodes(best_choices_, DeleteViableChoiceStruct);
444  best_choices_ = NIL_LIST;
445 
446  if (raw_choices_)
448  raw_choices_ = NIL_LIST;
449 
451 
452  for (BlobWidth = current_segmentation_,
453  End = current_segmentation_ + MAX_NUM_CHUNKS;
454  BlobWidth < End; *BlobWidth++ = 1);
455 
456 }
457 
459  if (best_choices_) destroy_nodes(best_choices_, DeleteViableChoiceStruct);
460  best_choices_ = NIL_LIST;
461 }
462 
464  BLOB_WIDTH *Segmentation;
465  for (Segmentation = current_segmentation_; *BlobWidth != 0;
466  BlobWidth++, Segmentation++)
467  *Segmentation = *BlobWidth;
468  *Segmentation = 0;
469 }
470 
471 void Dict::LogNewSplit(int Blob) {
472  LIST Choices;
473  if (best_raw_choice_) AddNewChunk(best_raw_choice_, Blob);
474  Choices = best_choices_;
475  iterate(Choices) {
476  AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
477  }
478  Choices = raw_choices_;
479  iterate(Choices) {
480  AddNewChunk ((VIABLE_CHOICE) first_node (Choices), Blob);
481  }
482 }
483 
484 void Dict::LogNewChoice(FLOAT32 AdjustFactor,
485  const float Certainties[],
486  bool raw_choice,
487  WERD_CHOICE *WordChoice,
488  const BLOB_CHOICE_LIST_VECTOR &blob_choices) {
489  LIST ChoicesList;
490  LIST Choices;
491  FLOAT32 Threshold;
492 
493  if (!keep_word_choices_)
494  return;
495 
496  if (raw_choice) {
497  if (!best_raw_choice_) {
498  best_raw_choice_ =
499  NewViableChoice(*WordChoice, AdjustFactor, Certainties);
500  } else if (WordChoice->rating() < best_raw_choice_->Rating) {
501  if (ChoiceSameAs(*WordChoice, best_raw_choice_)) {
502  FillViableChoice(*WordChoice, AdjustFactor, Certainties,
503  best_raw_choice_);
504  } else {
505  delete best_raw_choice_;
506  best_raw_choice_ =
507  NewViableChoice(*WordChoice, AdjustFactor, Certainties);
508  }
509  }
510  if (!save_raw_choices) return;
511  ChoicesList = raw_choices_;
512  } else {
513  ChoicesList = best_choices_;
514  }
515 
516  // Throw out obviously bad choices to save some work.
517  if (ChoicesList != NIL_LIST) {
518  Threshold = StopperAmbigThreshold(BestFactor(ChoicesList), AdjustFactor);
519  if (Threshold > -stopper_ambiguity_threshold_offset)
521  if (WordChoice->certainty() - BestCertainty (ChoicesList) < Threshold) {
522  // Set the rating of the word to be terrible, so that it does not
523  // get chosen as the best choice.
524  if (stopper_debug_level >= 2) {
525  STRING bad_string;
526  WordChoice->string_and_lengths(&bad_string, NULL);
527  tprintf("Discarding choice \"%s\" with an overly low certainty"
528  " %.4f vs best choice certainty %.4f (Threshold: %.4f)\n",
529  bad_string.string(), WordChoice->certainty(),
530  BestCertainty(ChoicesList),
531  Threshold + BestCertainty(ChoicesList));
532  }
533  WordChoice->set_rating(WERD_CHOICE::kBadRating);
534  return;
535  }
536  }
537 
538  // See if a choice with the same text string has already been found.
539  VIABLE_CHOICE NewChoice = NULL;
540  Choices = ChoicesList;
541 
542  iterate(Choices) {
543  if (ChoiceSameAs (*WordChoice, (VIABLE_CHOICE) first_node (Choices))) {
544  if (WordChoice->rating() < BestRating (Choices)) {
545  NewChoice = (VIABLE_CHOICE) first_node (Choices);
546  } else {
547  return;
548  }
549  }
550  }
551 
552  if (NewChoice) {
553  FillViableChoice(*WordChoice, AdjustFactor, Certainties, NewChoice);
554  ChoicesList = delete_d(ChoicesList, NewChoice, is_same_node);
555  } else {
556  NewChoice = NewViableChoice(*WordChoice, AdjustFactor, Certainties);
557  }
558 
559  // Now we know we're gonna save it, so add the expensive copy.
560  NewChoice->SetBlobChoices(blob_choices);
561 
562  ChoicesList = s_adjoin (ChoicesList, NewChoice, CmpChoiceRatings);
563  if (stopper_debug_level >= 2)
564  raw_choice ? PrintViableChoice (stderr, "New Raw Choice: ", NewChoice) :
565  PrintViableChoice (stderr, "New Word Choice: ", NewChoice);
566  if (count (ChoicesList) > tessedit_truncate_wordchoice_log) {
567  Choices =
570  set_rest(Choices, NIL_LIST);
571  }
572 
573  // Update raw_choices_/best_choices_ pointer.
574  if (raw_choice) {
575  raw_choices_ = ChoicesList;
576  } else {
577  best_choices_ = ChoicesList;
578  }
579 }
580 
582  DANGERR *fixpt,
583  bool fix_replaceable,
584  BLOB_CHOICE_LIST_VECTOR *blob_choices,
585  bool *modified_blobs) {
586  if (stopper_debug_level > 2) {
587  tprintf("\nRunning NoDangerousAmbig() for %s\n",
588  best_choice->debug_string().string());
589  }
590 
591  // Construct BLOB_CHOICE_LIST_VECTOR with ambiguities
592  // for each unichar id in BestChoice.
593  BLOB_CHOICE_LIST_VECTOR ambig_blob_choices;
594  int i;
595  bool modified_best_choice = false;
596  bool ambigs_found = false;
597  // For each position in best_choice:
598  // -- choose AMBIG_SPEC_LIST that corresponds to unichar_id at best_choice[i]
599  // -- initialize wrong_ngram with a single unichar_id at best_choice[i]
600  // -- look for ambiguities corresponding to wrong_ngram in the list while
601  // adding the following unichar_ids from best_choice to wrong_ngram
602  //
603  // Repeat the above procedure twice: first time look through
604  // ambigs to be replaced and replace all the ambiguities found;
605  // second time look through dangerous ambiguities and construct
606  // ambig_blob_choices with fake a blob choice for each ambiguity
607  // and pass them to dawg_permute_and_select() to search for
608  // ambiguous words in the dictionaries.
609  //
610  // Note that during the execution of the for loop (on the first pass)
611  // if replacements are made the length of best_choice might change.
612  for (int pass = 0; pass < (fix_replaceable ? 2 : 1); ++pass) {
613  bool replace = (fix_replaceable && pass == 0);
614  const UnicharAmbigsVector &table = replace ?
616  if (!replace) {
617  // Initialize ambig_blob_choices with lists containing a single
618  // unichar id for the correspoding position in best_choice.
619  // best_choice consisting from only the original letters will
620  // have a rating of 0.0.
621  for (i = 0; i < best_choice->length(); ++i) {
622  BLOB_CHOICE_LIST *lst = new BLOB_CHOICE_LIST();
623  BLOB_CHOICE_IT lst_it(lst);
624  // TODO(rays/antonova) Should these BLOB_CHOICEs use real xheights
625  // or are these fake ones good enough?
626  lst_it.add_to_end(new BLOB_CHOICE(best_choice->unichar_id(i),
627  0.0, 0.0, -1, -1, -1, 0, 1, false));
628  ambig_blob_choices.push_back(lst);
629  }
630  }
631  UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE + 1];
632  int wrong_ngram_index;
633  int next_index;
634  int blob_index = 0;
635  for (i = 0; i < best_choice->length(); ++i) {
636  if (i > 0) blob_index += best_choice->fragment_length(i-1);
637  UNICHAR_ID curr_unichar_id = best_choice->unichar_id(i);
638  if (stopper_debug_level > 2) {
639  tprintf("Looking for %s ngrams starting with %s:\n",
640  replace ? "replaceable" : "ambiguous",
641  getUnicharset().debug_str(curr_unichar_id).string());
642  }
643  wrong_ngram_index = 0;
644  wrong_ngram[wrong_ngram_index] = curr_unichar_id;
645  if (curr_unichar_id == INVALID_UNICHAR_ID ||
646  curr_unichar_id >= table.size() ||
647  table[curr_unichar_id] == NULL) {
648  continue; // there is no ambig spec for this unichar id
649  }
650  AmbigSpec_IT spec_it(table[curr_unichar_id]);
651  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();) {
652  const AmbigSpec *ambig_spec = spec_it.data();
653  wrong_ngram[wrong_ngram_index+1] = INVALID_UNICHAR_ID;
654  int compare = UnicharIdArrayUtils::compare(wrong_ngram,
655  ambig_spec->wrong_ngram);
656  if (stopper_debug_level > 2) {
657  tprintf("candidate ngram: ");
659  tprintf("current ngram from spec: ");
661  tprintf("comparison result: %d\n", compare);
662  }
663  if (compare == 0) {
664  // Record the place where we found an ambiguity.
665  if (fixpt != NULL) {
666  fixpt->push_back(DANGERR_INFO(
667  blob_index, blob_index+wrong_ngram_index, replace,
668  getUnicharset().get_isngram(ambig_spec->correct_ngram_id)));
669  if (stopper_debug_level > 1) {
670  tprintf("fixpt+=(%d %d %d %d)\n", blob_index,
671  blob_index+wrong_ngram_index, false,
672  getUnicharset().get_isngram(
673  ambig_spec->correct_ngram_id));
674  }
675  }
676 
677  if (replace) {
678  if (stopper_debug_level > 2) {
679  tprintf("replace ambiguity with: ");
681  ambig_spec->correct_fragments, getUnicharset());
682  }
683  ReplaceAmbig(i, ambig_spec->wrong_ngram_size,
684  ambig_spec->correct_ngram_id,
685  best_choice, blob_choices, modified_blobs);
686  modified_best_choice = true;
687  } else if (i > 0 || ambig_spec->type != CASE_AMBIG) {
688  // We found dang ambig - update ambig_blob_choices.
689  if (stopper_debug_level > 2) {
690  tprintf("found ambiguity: ");
692  ambig_spec->correct_fragments, getUnicharset());
693  }
694  ambigs_found = true;
695  for (int tmp_index = 0; tmp_index <= wrong_ngram_index;
696  ++tmp_index) {
697  // Add a blob choice for the corresponding fragment of the
698  // ambiguity. These fake blob choices are initialized with
699  // negative ratings (which are not possible for real blob
700  // choices), so that dawg_permute_and_select() considers any
701  // word not consisting of only the original letters a better
702  // choice and stops searching for alternatives once such a
703  // choice is found.
704  BLOB_CHOICE_IT bc_it(ambig_blob_choices[i+tmp_index]);
705  bc_it.add_to_end(new BLOB_CHOICE(
706  ambig_spec->correct_fragments[tmp_index], -1.0, 0.0,
707  -1, -1, -1, 0, 1, false));
708  }
709  }
710  spec_it.forward();
711  } else if (compare == -1) {
712  if (wrong_ngram_index+1 < ambig_spec->wrong_ngram_size &&
713  ((next_index = wrong_ngram_index+1+i) < best_choice->length())) {
714  // Add the next unichar id to wrong_ngram and keep looking for
715  // more ambigs starting with curr_unichar_id in AMBIG_SPEC_LIST.
716  wrong_ngram[++wrong_ngram_index] =
717  best_choice->unichar_id(next_index);
718  } else {
719  break; // no more matching ambigs in this AMBIG_SPEC_LIST
720  }
721  } else {
722  spec_it.forward();
723  }
724  } // end searching AmbigSpec_LIST
725  } // end searching best_choice
726  } // end searching replace and dangerous ambigs
727 
728  // If any ambiguities were found permute the constructed ambig_blob_choices
729  // to see if an alternative dictionary word can be found.
730  if (ambigs_found) {
731  if (stopper_debug_level > 2) {
732  tprintf("\nResulting ambig_blob_choices:\n");
733  for (i = 0; i < ambig_blob_choices.length(); ++i) {
734  print_ratings_list("", ambig_blob_choices.get(i), getUnicharset());
735  tprintf("\n");
736  }
737  }
738  WERD_CHOICE *alt_word = dawg_permute_and_select(ambig_blob_choices, 0.0);
739  ambigs_found = (alt_word->rating() < 0.0);
740  if (ambigs_found) {
741  if (stopper_debug_level >= 1) {
742  tprintf ("Stopper: Possible ambiguous word = %s\n",
743  alt_word->debug_string().string());
744  }
745  if (fixpt != NULL) {
746  // Note: Currently character choices combined from fragments can only
747  // be generated by NoDangrousAmbigs(). This code should be updated if
748  // the capability to produce classifications combined from character
749  // fragments is added to other functions.
750  int orig_i = 0;
751  for (i = 0; i < alt_word->length(); ++i) {
752  bool replacement_is_ngram =
753  getUnicharset().get_isngram(alt_word->unichar_id(i));
754  int end_i = orig_i + alt_word->fragment_length(i) - 1;
755  if (alt_word->fragment_length(i) > 1 ||
756  (orig_i == end_i && replacement_is_ngram)) {
757  fixpt->push_back(DANGERR_INFO(orig_i, end_i, true,
758  replacement_is_ngram));
759  if (stopper_debug_level > 1) {
760  tprintf("fixpt->dangerous+=(%d %d %d %d)\n", orig_i, end_i,
761  true, replacement_is_ngram);
762  }
763  }
764  orig_i += alt_word->fragment_length(i);
765  }
766  }
767  }
768  delete alt_word;
769  }
770  if (output_ambig_words_file_ != NULL) {
771  fprintf(output_ambig_words_file_, "\n");
772  }
773 
774  ambig_blob_choices.delete_data_pointers();
775  return !ambigs_found;
776 }
777 
779 
781  reject_offset_ = 0.0;
782 }
783 
786 }
787 
788 void Dict::AddNewChunk(VIABLE_CHOICE Choice, int Blob) {
789  int i, LastChunk;
790  for (i = 0, LastChunk = 0; i < Choice->Length; i++) {
791  LastChunk += Choice->Blob[i].NumChunks;
792  if (Blob < LastChunk) {
793  (Choice->Blob[i].NumChunks)++;
794  return;
795  }
796  }
797  cprintf ("AddNewChunk failed:Choice->Length=%d, LastChunk=%d, Blob=%d\n",
798  Choice->Length, LastChunk, Blob);
799  assert(false); // this should never get executed
800 }
801 
802 void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size,
803  UNICHAR_ID correct_ngram_id, WERD_CHOICE *werd_choice,
804  BLOB_CHOICE_LIST_VECTOR *blob_choices,
805  bool *modified_blobs) {
806  int num_blobs_to_replace = 0;
807  int begin_blob_index = 0;
808  int i;
809  for (i = 0; i < wrong_ngram_begin_index + wrong_ngram_size; ++i) {
810  if (i >= wrong_ngram_begin_index) {
811  num_blobs_to_replace += werd_choice->fragment_length(i);
812  } else {
813  begin_blob_index += werd_choice->fragment_length(i);
814  }
815  }
816  BLOB_CHOICE_IT bit;
817  int temp_blob_index = begin_blob_index;
818  const char *temp_uch = NULL;
819  const char *correct_ngram_str =
820  getUnicharset().id_to_unichar(correct_ngram_id);
821  for (int replaced_count = 0; replaced_count < wrong_ngram_size;
822  ++replaced_count) {
823  if (blob_choices != NULL) {
824  UNICHAR_ID uch_id = werd_choice->unichar_id(wrong_ngram_begin_index);
825  int fraglen = werd_choice->fragment_length(wrong_ngram_begin_index);
826  if (fraglen > 1) temp_uch = getUnicharset().id_to_unichar(uch_id);
827  for (i = 0; i < fraglen; ++i) {
828  if (fraglen > 1) {
829  STRING frag_str =
830  CHAR_FRAGMENT::to_string(temp_uch, i, fraglen, false);
831  getUnicharset().unichar_insert(frag_str.string());
832  uch_id = getUnicharset().unichar_to_id(frag_str.string());
833  }
834  bit.set_to_list(blob_choices->get(temp_blob_index));
835  STRING correct_frag_uch =
836  CHAR_FRAGMENT::to_string(correct_ngram_str,
837  temp_blob_index - begin_blob_index,
838  num_blobs_to_replace, false);
839  getUnicharset().unichar_insert(correct_frag_uch.string());
840  UNICHAR_ID correct_frag_uch_id =
841  getUnicharset().unichar_to_id(correct_frag_uch.string());
842  // Find the WERD_CHOICE corresponding to the original unichar in
843  // the list of blob choices, add the derived character fragment
844  // before it with the same rating and certainty.
845  for (bit.mark_cycle_pt(); !bit.cycled_list(); bit.forward()) {
846  if (bit.data()->unichar_id() == correct_frag_uch_id) {
847  break; // the unichar we want to insert is already there
848  }
849  if (bit.data()->unichar_id() == uch_id) {
850  bit.add_before_then_move(new BLOB_CHOICE(*(bit.data())));
851  bit.data()->set_unichar_id(correct_frag_uch_id);
852  if (modified_blobs != NULL) *modified_blobs = true;
853  break;
854  }
855  }
856  temp_blob_index++;
857  }
858  }
859  // Remove current unichar from werd_choice. On the last iteration
860  // set the correct replacement unichar instead of removing a unichar.
861  if (replaced_count + 1 == wrong_ngram_size) {
862  werd_choice->set_unichar_id(correct_ngram_id,
863  num_blobs_to_replace, 0.0, 0.0, wrong_ngram_begin_index);
864  } else {
865  werd_choice->remove_unichar_id(wrong_ngram_begin_index);
866  }
867  }
868  if (stopper_debug_level >= 1 && modified_blobs != NULL &&
869  *modified_blobs && blob_choices != NULL) {
870  werd_choice->print("ReplaceAmbig() ");
871  tprintf("Modified blob_choices: ");
872  for (int i = 0; i < blob_choices->size(); ++i) {
873  print_ratings_list("\n", blob_choices->get(i), getUnicharset());
874  }
875  }
876 }
877 
878 int Dict::ChoiceSameAs(const WERD_CHOICE &WordChoice,
879  VIABLE_CHOICE ViableChoice) {
880  return (StringSameAs(WordChoice, ViableChoice));
881 }
882 
884  int shortest = MAX_INT32;
885  int curr_len = 0;
886  for (int w = 0; w < WordChoice.length(); ++w) {
887  if (getUnicharset().get_isalpha(WordChoice.unichar_id(w))) {
888  curr_len++;
889  } else if (curr_len > 0) {
890  if (curr_len < shortest) shortest = curr_len;
891  curr_len = 0;
892  }
893  }
894  if (curr_len > 0 && curr_len < shortest) {
895  shortest = curr_len;
896  } else if (shortest == MAX_INT32) {
897  shortest = 0;
898  }
899  return shortest;
900 }
901 
903  FLOAT32 AdjustFactor,
904  const float Certainties[]) {
905  int Length = WordChoice.length();
906  assert (Length <= MAX_NUM_CHUNKS && Length > 0);
907  VIABLE_CHOICE NewChoice = new VIABLE_CHOICE_STRUCT(Length);
908  FillViableChoice(WordChoice, AdjustFactor, Certainties, NewChoice);
909  return NewChoice;
910 }
911 
912 void Dict::PrintViableChoice(FILE *File, const char *Label, VIABLE_CHOICE Choice) {
913  int i, j;
914  fprintf (File, "%s", Label);
915  fprintf(File, "(R=%5.1f, C=%4.1f, F=%4.2f, Frag=%d) ",
916  Choice->Rating, Choice->Certainty,
917  Choice->AdjustFactor, Choice->ComposedFromCharFragments);
918 
919  for (i = 0; i < Choice->Length; i++)
920  fprintf(File, "%s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
921  fprintf(File, "\n");
922 
923  for (i = 0; i < Choice->Length; i++) {
924  fprintf(File, " %s", getUnicharset().id_to_unichar(Choice->Blob[i].Class));
925  for (j = 0; j < Choice->Blob[i].NumChunks - 1; j++)
926  fprintf(File, " ");
927  }
928  fprintf(File, "\n");
929 
930  for (i = 0; i < Choice->Length; i++) {
931  for (j = 0; j < Choice->Blob[i].NumChunks; j++)
932  fprintf(File, "%3d ", (int) (Choice->Blob[i].Certainty * -10.0));
933  }
934  fprintf(File, "\n");
935 
936  for (i = 0; i < Choice->Length; i++) {
937  for (j = 0; j < Choice->Blob[i].NumChunks; j++)
938  fprintf(File, "%3d ", Choice->Blob[i].NumChunks);
939  }
940  fprintf(File, "\n");
941 }
942 
943 void Dict::FillViableChoice(const WERD_CHOICE &WordChoice,
944  FLOAT32 AdjustFactor, const float Certainties[],
945  VIABLE_CHOICE ViableChoice) {
946  ViableChoice->Init(WordChoice, current_segmentation_, Certainties,
947  AdjustFactor);
948 
949 }
950 
951 bool Dict::StringSameAs(const WERD_CHOICE &WordChoice,
952  VIABLE_CHOICE ViableChoice) {
953  if (WordChoice.length() != ViableChoice->Length) {
954  return false;
955  }
956  int i;
957  CHAR_CHOICE *CharChoice;
958  for (i = 0, CharChoice = &(ViableChoice->Blob[0]);
959  i < ViableChoice->Length; CharChoice++, i++) {
960  if (CharChoice->Class != WordChoice.unichar_id(i)) {
961  return false;
962  }
963  }
964  return true;
965 }
966 
967 bool Dict::StringSameAs(const char *String,
968  const char *String_lengths,
969  VIABLE_CHOICE ViableChoice) {
970  CHAR_CHOICE *Char;
971  int i;
972  int current_unichar_length;
973 
974  for (Char = &(ViableChoice->Blob[0]), i = 0;
975  i < ViableChoice->Length;
976  String += *(String_lengths++), Char++, i++) {
977  current_unichar_length = strlen(getUnicharset().id_to_unichar(Char->Class));
978  if (current_unichar_length != *String_lengths ||
979  strncmp(String, getUnicharset().id_to_unichar(Char->Class),
980  current_unichar_length) != 0)
981  return false;
982  }
983  return (*String == 0) ? true : false;
984 }
985 
987  const WERD_CHOICE &BestChoice) {
988  float Certainty;
989  float WorstCertainty = MAX_FLOAT32;
990  float CertaintyThreshold;
991  FLOAT64 TotalCertainty;
992  FLOAT64 TotalCertaintySquared;
993  FLOAT64 Variance;
994  FLOAT32 Mean, StdDev;
995  int WordLength;
996 
997  WordLength = Choices.length();
998  if (WordLength < 3)
999  return true;
1000 
1001  TotalCertainty = TotalCertaintySquared = 0.0;
1002  BLOB_CHOICE_IT BlobChoiceIt;
1003  for (int i = 0; i < Choices.length(); ++i) {
1004  BlobChoiceIt.set_to_list(Choices.get(i));
1005  Certainty = BlobChoiceIt.data()->certainty();
1006  TotalCertainty += Certainty;
1007  TotalCertaintySquared += Certainty * Certainty;
1008  if (Certainty < WorstCertainty)
1009  WorstCertainty = Certainty;
1010  }
1011 
1012  // Subtract off worst certainty from statistics.
1013  WordLength--;
1014  TotalCertainty -= WorstCertainty;
1015  TotalCertaintySquared -= WorstCertainty * WorstCertainty;
1016 
1017  Mean = TotalCertainty / WordLength;
1018  Variance = ((WordLength * TotalCertaintySquared -
1019  TotalCertainty * TotalCertainty) /
1020  (WordLength * (WordLength - 1)));
1021  if (Variance < 0.0)
1022  Variance = 0.0;
1023  StdDev = sqrt (Variance);
1024 
1025  CertaintyThreshold = Mean - stopper_allowable_character_badness * StdDev;
1026  if (CertaintyThreshold > stopper_nondict_certainty_base)
1027  CertaintyThreshold = stopper_nondict_certainty_base;
1028 
1029  if (BestChoice.certainty() < CertaintyThreshold) {
1030  if (stopper_debug_level >= 1)
1031  cprintf("Stopper: Non-uniform certainty = %4.1f"
1032  " (m=%4.1f, s=%4.1f, t=%4.1f)\n",
1033  BestChoice.certainty(), Mean, StdDev, CertaintyThreshold);
1034  return false;
1035  } else {
1036  return true;
1037  }
1038 }
1039 
1040 } // namespace tesseract