Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
recogtraining.cpp
Go to the documentation of this file.
1 
2 // File: recogtraining.cpp
3 // Description: Functions for ambiguity and parameter training.
4 // Author: Daria Antonova
5 // Created: Mon Aug 13 11:26:43 PDT 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #include "tesseractclass.h"
21 
22 #include "boxread.h"
23 #include "control.h"
24 #include "cutil.h"
25 #include "host.h"
26 #include "permute.h"
27 #include "ratngs.h"
28 #include "reject.h"
29 #include "stopper.h"
30 
31 namespace tesseract {
32 
34 
35 // Sets flags necessary for recognition in the training mode.
36 // Opens and returns the pointer to the output file.
//
// NOTE(review): the signature line of this function is absent from this
// listing, as is the statement that opens the block closed on listing
// line 47. Judging from the body, the function receives a file name
// (`fname`) and returns a FILE*; confirm against the real source.
//
// The returned handle is still open when this function returns; nothing in
// this body closes it.
39  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
40  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
41  save_blob_choices.set_value(1); // save individual char choices
42  getDict().save_raw_choices.set_value(1); // save raw choices
43  getDict().permute_only_top.set_value(true); // use only top choice permuter
44  tessedit_ok_mode.set_value(0); // turn off context checking
45  // Explore all segmentations.
// NOTE(review): the statement this comment introduces (listing line 46) is
// missing from this listing.
47  }
48 
// Build the output path from fname: cut it at the last '.', if there is one,
// and append ".txt".
49  STRING output_fname = fname;
50  const char *lastdot = strrchr(output_fname.string(), '.');
// NOTE(review): relies on STRING treating the '\0' written here as the new
// end of the string when ".txt" is appended below - confirm against STRING.
51  if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
52  output_fname += ".txt";
// Mode "a+" is handed to open_file(); presumably this appends to an existing
// file rather than truncating it (as with fopen) - verify in open_file().
53  FILE *output_file = open_file(output_fname.string(), "a+");
54  return output_file;
55 }
56 
57 // Copies the bounding box from page_res_it->word() to the given TBOX.
58 bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox) {
59  while (page_res_it->block() != NULL) {
60  if (page_res_it->word() != NULL)
61  break;
62  page_res_it->forward();
63  }
64 
65  if (page_res_it->word() != NULL) {
66  *tbox = page_res_it->word()->word->bounding_box();
67  page_res_it->forward();
68 
69  // If tbox->left() is negative, the training image has vertical text and
70  // all the coordinates of bounding boxes of page_res are rotated by 90
71  // degrees in a counterclockwise direction. We need to rotate the TBOX back
72  // in order to compare with the TBOXes of box files.
73  if (tbox->left() < 0) {
74  tbox->rotate(FCOORD(0.0, -1.0));
75  }
76 
77  return true;
78  } else {
79  return false;
80  }
81 }
82 
83 // This function takes tif/box pair of files and runs recognition on the image,
84 // while making sure that the word bounds that tesseract identified roughly
85 // match to those specified by the input box file. For each word (ngram in a
86 // single bounding box from the input box file) it outputs the ocred result,
87 // the correct label, rating and certainty.
//
// NOTE(review): the first line of the signature (return type, function name
// and the `fname` parameter used below) is absent from this listing.
// NOTE(review): `monitor` is not referenced anywhere in the visible body.
89  PAGE_RES *page_res,
90  volatile ETEXT_DESC *monitor,
91  FILE *output_file) {
// Build the box-file path from fname: cut it at the last '.', if there is
// one, and append ".box".
92  STRING box_fname = fname;
93  const char *lastdot = strrchr(box_fname.string(), '.');
94  if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
95  box_fname += ".box";
96  // read_next_box() will close box_file
// NOTE(review): the body below calls ReadNextBox(), not read_next_box(), and
// never closes box_file itself. Confirm that the reader really closes the
// handle; otherwise every call to this function leaks a FILE.
97  FILE *box_file = open_file(box_fname.string(), "r");
98 
99  PAGE_RES_IT page_res_it;
100  page_res_it.page_res = page_res;
101  page_res_it.restart_page();
102  STRING label;
103 
104  // Process all the words on this page.
105  TBOX tbox; // tesseract-identified box
106  TBOX bbox; // box from the box file
107  bool keep_going;
108  int line_number = 0;
109  int examined_words = 0;
110  do {
// Pull one box from each source. `&=` does not short-circuit, so
// ReadNextBox() is called even when read_t() has already returned false.
111  keep_going = read_t(&page_res_it, &tbox);
112  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
113  &bbox);
114  // Align bottom left points of the TBOXes.
// While the bottoms differ by more than the kMaxBoxEdgeDiff tolerance,
// advance one source: the tesseract word when the file box's bottom is
// lower, otherwise the box file. Stops when either source runs out.
115  while (keep_going &&
116  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
117  keep_going = (bbox.bottom() < tbox.bottom()) ?
118  read_t(&page_res_it, &tbox) :
119  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
120  }
// Same procedure for the left edges: advance the tesseract word when the
// file box's left edge is further right, otherwise the box file.
121  while (keep_going &&
122  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
123  keep_going = (bbox.left() > tbox.left()) ? read_t(&page_res_it, &tbox) :
124  ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
125  }
126  // OCR the word if top right points of the TBOXes are similar.
// read_t() steps the iterator forward after copying a word's box, so the
// word that tbox describes is presumably the iterator's "previous" word,
// which is why the prev_*() accessors are used here.
127  if (keep_going &&
128  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
129  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
130  ambigs_classify_and_output(page_res_it.prev_word(),
131  page_res_it.prev_row(),
132  page_res_it.prev_block(),
133  label.string(), output_file);
134  examined_words++;
135  }
136  } while (keep_going);
137 
138  // Set up scripts on all of the words that did not get sent to
139  // ambigs_classify_and_output. They all should have, but if all the
140  // werd_res's don't get uch_sets, tesseract will crash when you try
141  // to iterate over them. :-(
// This second pass over the page also counts every word, for the coverage
// report below.
142  int total_words = 0;
143  for (page_res_it.restart_page(); page_res_it.block() != NULL;
144  page_res_it.forward()) {
145  if (page_res_it.word()) {
146  if (page_res_it.word()->uch_set == NULL)
147  page_res_it.word()->SetupFake(unicharset);
148  total_words++;
149  }
150  }
// Warn when fewer than 85% of the page's words were matched to a box and
// classified.
151  if (examined_words < 0.85 * total_words) {
152  tprintf("TODO(antonova): clean up recog_training_segmented; "
153  " It examined only a small fraction of the ambigs image.\n");
154  }
155  tprintf("recog_training_segmented: examined %d / %d words.\n",
156  examined_words, total_words);
157 }
158 
159 // Runs classify_word_pass1() on the current word. Outputs Tesseract's
160 // raw choice as a result of the classification. For words labeled with a
161 // single unichar also outputs all alternatives from blob_choices of the
162 // best choice.
//
// NOTE(review): the first line of the signature (return type, function name
// and the `werd_res` parameter used below) is absent from this listing.
//
// Each blob-choice line written to output_file has four tab-separated fields:
// recognised unichar, label, rating, certainty (the last two to four
// decimal places).
164  ROW_RES *row_res,
165  BLOCK_RES *block_res,
166  const char *label,
167  FILE *output_file) {
168  int offset;
169  // Classify word.
170  fflush(stdout);
171  classify_word_pass1(block_res->block, row_res->row, werd_res);
172  WERD_CHOICE *best_choice = werd_res->best_choice;
173  ASSERT_HOST(best_choice != NULL);
174  ASSERT_HOST(best_choice->blob_choices() != NULL);
175 
176  // Compute the number of unichars in the label.
// The loop body is empty; all of the work is in the increment expression.
// Each pass asks the unicharset for the step at the current offset, moves
// the offset on by that amount and counts one unichar. The loop ends at the
// end of the label or as soon as a step is not positive.
177  int label_num_unichars = 0;
178  int step = 1; // should be non-zero on the first iteration
179  for (offset = 0; label[offset] != '\0' && step > 0;
180  step = werd_res->uch_set->step(label + offset),
181  offset += step, ++label_num_unichars);
// A zero step is reported as an illegal unichar and nothing is written.
// NOTE(review): a negative step would also end the loop but is not caught
// here; whether step() can return a negative value is not visible in this
// file.
182  if (step == 0) {
183  tprintf("Not outputting illegal unichar %s\n", label);
184  return;
185  }
186 
187  // Output all classifier choices for the unigrams (1->1 classifications).
// Only taken when the label is one unichar and the best choice holds
// exactly one blob-choice list.
188  if (label_num_unichars == 1 && best_choice->blob_choices()->length() == 1) {
189  BLOB_CHOICE_LIST_C_IT outer_blob_choice_it;
190  outer_blob_choice_it.set_to_list(best_choice->blob_choices());
191  BLOB_CHOICE_IT blob_choice_it;
// Presumably data() right after set_to_list() yields the list's first (and
// here only) element - confirm against the list iterator's contract.
192  blob_choice_it.set_to_list(outer_blob_choice_it.data());
193  for (blob_choice_it.mark_cycle_pt();
194  !blob_choice_it.cycled_list();
195  blob_choice_it.forward()) {
196  BLOB_CHOICE *blob_choice = blob_choice_it.data();
// Choices with an invalid unichar id are skipped.
197  if (blob_choice->unichar_id() != INVALID_UNICHAR_ID) {
198  fprintf(output_file, "%s\t%s\t%.4f\t%.4f\n",
199  unicharset.id_to_unichar(blob_choice->unichar_id()),
200  label, blob_choice->rating(), blob_choice->certainty());
201  }
202  }
203  }
204  // Output raw choices for many->many and 1->many classifications.
// This call is made for every label that got past the illegal-unichar
// check, including the single-unichar case handled above.
205  getDict().PrintAmbigAlternatives(output_file, label, label_num_unichars);
206 }
207 
208 } // namespace tesseract