Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
ratngs.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ratngs.h (Formerly ratings.h)
3  * Description: Definition of the WERD_CHOICE and BLOB_CHOICE classes.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 11:40:38 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef RATNGS_H
21 #define RATNGS_H
22 
23 #include <assert.h>
24 
25 #include "clst.h"
26 #include "genericvector.h"
27 #include "notdll.h"
28 #include "unichar.h"
29 #include "unicharset.h"
30 #include "werd.h"
31 
32 class BLOB_CHOICE: public ELIST_LINK
33 {
34  public:
36  unichar_id_ = INVALID_UNICHAR_ID;
37  fontinfo_id_ = -1;
38  fontinfo_id2_ = -1;
39  rating_ = MAX_FLOAT32;
40  certainty_ = -MAX_FLOAT32;
41  script_id_ = -1;
42  language_model_state_ = NULL;
43  min_xheight_ = 0;
44  max_xheight_ = 0;
45  adapted_ = false;
46  }
47  BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
48  float src_rating, // rating
49  float src_cert, // certainty
50  inT16 src_fontinfo_id, // font
51  inT16 src_fontinfo_id2, // 2nd choice font
52  int script_id, // script
53  inT16 min_xheight, // min xheight in image pixel units
54  inT16 max_xheight, // max xheight allowed by this char
55  bool adapted); // adapted match or not
56  BLOB_CHOICE(const BLOB_CHOICE &other);
58 
60  return unichar_id_;
61  }
62  float rating() const {
63  return rating_;
64  }
65  float certainty() const {
66  return certainty_;
67  }
68  inT16 fontinfo_id() const {
69  return fontinfo_id_;
70  }
71  inT16 fontinfo_id2() const {
72  return fontinfo_id2_;
73  }
74  int script_id() const {
75  return script_id_;
76  }
78  return language_model_state_;
79  }
80  inT16 xgap_before() const {
81  return xgap_before_;
82  }
83  inT16 xgap_after() const {
84  return xgap_after_;
85  }
86  inT16 min_xheight() const {
87  return min_xheight_;
88  }
89  inT16 max_xheight() const {
90  return max_xheight_;
91  }
92  bool adapted() const {
93  return adapted_;
94  }
95 
96  void set_unichar_id(UNICHAR_ID newunichar_id) {
97  unichar_id_ = newunichar_id;
98  }
99  void set_rating(float newrat) {
100  rating_ = newrat;
101  }
102  void set_certainty(float newrat) {
103  certainty_ = newrat;
104  }
105  void set_fontinfo_id(inT16 newfont) {
106  fontinfo_id_ = newfont;
107  }
108  void set_fontinfo_id2(inT16 newfont) {
109  fontinfo_id2_ = newfont;
110  }
111  void set_script(int newscript_id) {
112  script_id_ = newscript_id;
113  }
115  language_model_state_ = language_model_state;
116  }
117  void set_xgap_before(inT16 gap) {
118  xgap_before_ = gap;
119  }
120  void set_xgap_after(inT16 gap) {
121  xgap_after_ = gap;
122  }
123  void set_adapted(bool adapted) {
124  adapted_ = adapted;
125  }
126  static BLOB_CHOICE* deep_copy(const BLOB_CHOICE* src) {
127  BLOB_CHOICE* choice = new BLOB_CHOICE;
128  *choice = *src;
129  return choice;
130  }
131  void print(const UNICHARSET *unicharset) {
132  tprintf("r%.2f c%.2f : %d %s", rating_, certainty_, unichar_id_,
133  (unicharset == NULL) ? "" :
134  unicharset->debug_str(unichar_id_).string());
135  }
136 
137  private:
138  UNICHAR_ID unichar_id_; // unichar id
139  inT16 fontinfo_id_; // char font information
140  inT16 fontinfo_id2_; // 2nd choice font information
141  // Rating is the classifier distance weighted by the length of the outline
142  // in the blob. In terms of probability, classifier distance is -k log p such
143  // that the resulting distance is in the range [0, 1] and then
144  // rating = w (-k log p) where w is the weight for the length of the outline.
145  // Sums of ratings may be compared meaningfully for words of different
146  // segmentation.
147  float rating_; // size related
148  // Certainty is a number in [-20, 0] indicating the classifier certainty
149  // of the choice. In terms of probability, certainty = 20 (k log p) where
150  // k is defined as above to normalize -k log p to the range [0, 1].
151  float certainty_; // absolute
152  int script_id_;
153  // Stores language model information about this BLOB_CHOICE. Used during
154  // the segmentation search for BLOB_CHOICEs in BLOB_CHOICE_LISTs that are
155  // recorded in the ratings matrix.
156  // The pointer is owned/managed by the segmentation search.
157  void *language_model_state_;
158  inT16 xgap_before_;
159  inT16 xgap_after_;
160  // X-height range (in image pixels) that this classification supports.
161  inT16 min_xheight_;
162  inT16 max_xheight_;
163  bool adapted_; // true if this is a match from adapted templates
164 };
165 
166 // Make BLOB_CHOICE listable.
167 ELISTIZEH (BLOB_CHOICE) CLISTIZEH (BLOB_CHOICE_LIST)
168 
169 // Permuter codes used in WERD_CHOICEs.
// Identifies which permuter produced a WERD_CHOICE. The codes are stored in
// a uinT8 (see WERD_CHOICE::permuter_), so each numeric value is spelled out
// explicitly instead of being implied by declaration order. There is no
// trailing comma after the last enumerator, which keeps the declaration
// valid for pre-C++11 compilers as well.
enum PermuterType {
  NO_PERM = 0,
  PUNC_PERM = 1,
  TOP_CHOICE_PERM = 2,
  LOWER_CASE_PERM = 3,
  UPPER_CASE_PERM = 4,
  NGRAM_PERM = 5,
  NUMBER_PERM = 6,
  USER_PATTERN_PERM = 7,
  SYSTEM_DAWG_PERM = 8,
  DOC_DAWG_PERM = 9,
  USER_DAWG_PERM = 10,
  FREQ_DAWG_PERM = 11,
  COMPOUND_PERM = 12
};
185 
186 class WERD_CHOICE {
187  public:
188  static const float kBadRating;
189 
191  : unicharset_(unicharset) { this->init(8); }
192  WERD_CHOICE(const UNICHARSET *unicharset, int reserved)
193  : unicharset_(unicharset) { this->init(reserved); }
194  WERD_CHOICE(const char *src_string,
195  const char *src_lengths,
196  float src_rating,
197  float src_certainty,
198  uinT8 src_permuter,
199  const UNICHARSET &unicharset)
200  : unicharset_(&unicharset) {
201  this->init(src_string, src_lengths, src_rating,
202  src_certainty, src_permuter);
203  }
204  WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset);
205  WERD_CHOICE(const WERD_CHOICE &word) : unicharset_(word.unicharset_) {
206  this->init(word.length());
207  this->operator=(word);
208  }
209  ~WERD_CHOICE();
210 
211  const UNICHARSET *unicharset() const {
212  return unicharset_;
213  }
214  inline int length() const {
215  return length_;
216  }
217  inline const UNICHAR_ID *unichar_ids() const {
218  return unichar_ids_;
219  }
220  inline const UNICHAR_ID unichar_id(int index) const {
221  assert(index < length_);
222  return unichar_ids_[index];
223  }
224  inline const char *fragment_lengths() const {
225  return fragment_lengths_;
226  }
227  inline const char fragment_length(int index) const {
228  assert(index < length_);
229  return fragment_lengths_[index];
230  }
231  inline float rating() const {
232  return rating_;
233  }
234  inline float certainty() const {
235  return certainty_;
236  }
237  inline uinT8 permuter() const {
238  return permuter_;
239  }
240  const char *permuter_name() const;
241  inline bool fragment_mark() const {
242  return fragment_mark_;
243  }
244  inline BLOB_CHOICE_LIST_CLIST* blob_choices() {
245  return blob_choices_;
246  }
247  inline void set_unichar_id(UNICHAR_ID unichar_id, int index) {
248  assert(index < length_);
249  unichar_ids_[index] = unichar_id;
250  }
251  inline void set_fragment_length(char flen, int index) {
252  assert(index < length_);
253  fragment_lengths_[index] = flen;
254  }
255  inline void set_rating(float new_val) {
256  rating_ = new_val;
257  }
258  inline void set_certainty(float new_val) {
259  certainty_ = new_val;
260  }
261  inline void set_permuter(uinT8 perm) {
262  permuter_ = perm;
263  }
264  inline void set_fragment_mark(bool new_fragment_mark) {
265  fragment_mark_ = new_fragment_mark;
266  }
267  // Note: this function should only be used if all the fields
268  // are populated manually with set_* functions (rather than
269  // (copy)constructors and append_* functions).
270  inline void set_length(int len) {
271  ASSERT_HOST(reserved_ >= len);
272  length_ = len;
273  }
274  void set_blob_choices(BLOB_CHOICE_LIST_CLIST *blob_choices);
275 
277  inline void double_the_size() {
278  if (reserved_ > 0) {
280  reserved_, unichar_ids_);
282  reserved_, fragment_lengths_);
283  reserved_ *= 2;
284  } else {
285  unichar_ids_ = new UNICHAR_ID[1];
286  fragment_lengths_ = new char[1];
287  reserved_ = 1;
288  }
289  }
290 
293  inline void init(int reserved) {
294  reserved_ = reserved;
295  if (reserved > 0) {
296  unichar_ids_ = new UNICHAR_ID[reserved];
297  fragment_lengths_ = new char[reserved];
298  } else {
299  unichar_ids_ = NULL;
300  fragment_lengths_ = NULL;
301  }
302  length_ = 0;
303  rating_ = 0.0;
304  certainty_ = MAX_FLOAT32;
305  permuter_ = NO_PERM;
306  fragment_mark_ = false;
307  blob_choices_ = NULL;
308  unichars_in_script_order_ = false; // Tesseract is strict left-to-right.
309  }
310 
316  void init(const char *src_string, const char *src_lengths,
317  float src_rating, float src_certainty,
318  uinT8 src_permuter);
319 
321  inline void make_bad() {
322  length_ = 0;
323  rating_ = kBadRating;
324  certainty_ = -MAX_FLOAT32;
325  fragment_mark_ = false;
326  }
327 
333  float rating, float certainty) {
334  assert(reserved_ > length_);
335  length_++;
336  this->set_unichar_id(unichar_id, fragment_length,
337  rating, certainty, length_-1);
338  }
339 
341  float rating, float certainty);
342 
344  float rating, float certainty, int index) {
345  assert(index < length_);
346  unichar_ids_[index] = unichar_id;
347  fragment_lengths_[index] = fragment_length;
348  rating_ += rating;
349  if (certainty < certainty_) {
350  certainty_ = certainty;
351  }
352  }
353 
355  void remove_unichar_ids(int index, int num);
356  inline void remove_last_unichar_id() { --length_; }
357  inline void remove_unichar_id(int index) {
358  this->remove_unichar_ids(index, 1);
359  }
360  bool has_rtl_unichar_id() const;
362 
363  // Returns the half-open interval of unichar_id indices [start, end) which
364  // enclose the core portion of this word -- the part after stripping
365  // punctuation from the left and right.
366  void punct_stripped(int *start_core, int *end_core) const;
367 
368  // Return a copy of this WERD_CHOICE with the choices [start, end).
369  // The result is useful only for checking against a dictionary.
370  WERD_CHOICE shallow_copy(int start, int end) const;
371 
372  void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const;
373  const STRING debug_string() const {
374  STRING word_str;
375  for (int i = 0; i < length_; ++i) {
376  word_str += unicharset_->debug_str(unichar_ids_[i]);
377  word_str += " ";
378  }
379  return word_str;
380  }
381 
382  // Call this to override the default (strict left to right graphemes)
383  // with the fact that some engine produces a "reading order" set of
384  // Graphemes for each word.
385  bool set_unichars_in_script_order(bool in_script_order) {
386  return unichars_in_script_order_ = in_script_order;
387  }
388 
390  return unichars_in_script_order_;
391  }
392 
393  // Returns a UTF-8 string equivalent to the current choice
394  // of UNICHAR IDs.
395  const STRING &unichar_string() const {
396  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
397  return unichar_string_;
398  }
399 
400  // Returns the lengths, one byte each, representing the number of bytes
401  // required in the unichar_string for each UNICHAR_ID.
402  const STRING &unichar_lengths() const {
403  this->string_and_lengths(&unichar_string_, &unichar_lengths_);
404  return unichar_lengths_;
405  }
406  const void print() const { this->print(""); }
407  const void print(const char *msg) const;
408 
409  WERD_CHOICE& operator+= ( // concatenate
410  const WERD_CHOICE & second);// second on first
411 
412  WERD_CHOICE& operator= (const WERD_CHOICE& source);
413 
414  private:
415  const UNICHARSET *unicharset_;
416  UNICHAR_ID *unichar_ids_; // unichar ids that represent the text of the word
417  char *fragment_lengths_; // number of fragments in each unichar
418  int reserved_; // size of the above arrays
419  int length_; // word length
420  // Rating is the sum of the ratings of the individual blobs in the word.
421  float rating_; // size related
422  // certainty is the min (worst) certainty of the individual blobs in the word.
423  float certainty_; // absolute
424  uinT8 permuter_; // permuter code
425  bool fragment_mark_; // if true, indicates that this choice
426  // was chosen over a better one that
427  // contained a fragment
428  BLOB_CHOICE_LIST_CLIST *blob_choices_; // best choices for each blob
429 
430  // Normally, the blob_choices_ represent the recognition results in order
431  // from left-to-right. However, some engines (say Cube) may return
432  // recognition results in the order of the script's major reading direction
433  // (for Arabic, that is right-to-left).
434  bool unichars_in_script_order_;
435 
436  // The following variables are populated and passed by reference any
437  // time unichar_string() or unichar_lengths() are called.
438  mutable STRING unichar_string_;
439  mutable STRING unichar_lengths_;
440 
441  bool unichar_info_present;
442 
443  private:
444  void delete_blob_choices();
445 };
446 
447 // Make WERD_CHOICE listable.
449 typedef GenericVector<BLOB_CHOICE_LIST *> BLOB_CHOICE_LIST_VECTOR;
450 typedef GenericVector<WERD_CHOICE_LIST *> WERD_CHOICE_LIST_VECTOR;
451 
452 // Utilities for comparing WERD_CHOICEs
453 
455  const WERD_CHOICE &word2);
456 
457 // Utilities for debug printing.
458 void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings);
459 void print_ratings_list(
460  const char *msg, // intro message
461  BLOB_CHOICE_LIST *ratings, // list of results
462  const UNICHARSET &current_unicharset // unicharset that can be used
463  // for id-to-unichar conversion
464  );
465 void print_ratings_info(
466  FILE *fp, // file to use
467  BLOB_CHOICE_LIST *ratings, // list of results
468  const UNICHARSET &current_unicharset // unicharset that can be used
469  // for id-to-unichar conversion
470  );
472  const char *msg,
473  const BLOB_CHOICE_LIST_VECTOR &char_choices,
474  const UNICHARSET &current_unicharset,
475  BOOL8 detailed
476  );
478  WERD_CHOICE *word,
479  GenericVector<WERD_CHOICE *> *alternates);
480 
481 #endif