Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
paragraphs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: paragraphs.cpp
3  * Description: Paragraph detection for tesseract.
4  * Author: David Eger
5  * Created: 25 February 2011
6  *
7  * (C) Copyright 2011, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 #ifdef _MSC_VER
20 #define __func__ __FUNCTION__
21 #endif
22 
23 #include <ctype.h>
24 
25 #include "genericvector.h"
26 #include "helpers.h"
27 #include "mutableiterator.h"
28 #include "ocrpara.h"
29 #include "pageres.h"
30 #include "paragraphs.h"
31 #include "paragraphs_internal.h"
32 #include "publictypes.h"
33 #include "ratngs.h"
34 #include "rect.h"
35 #include "statistc.h"
36 #include "strngs.h"
37 #include "tprintf.h"
38 #include "unicharset.h"
39 #include "unicodes.h"
40 
41 namespace tesseract {
42 
43 // The tab vectors for a given line should be ignored if both its tab vectors
44 // are infrequent, specifically, if both tab vectors appear at most once per
45 // kStrayLinePer lines in a block.
46 const int kStrayLinePer = 6;
47 
48 // Special "weak" ParagraphModels.
// NOTE(review): the declarator halves of these two definitions (original lines
// 49 and 51, presumably "const ParagraphModel *kCrownLeft" and
// "const ParagraphModel *kCrownRight") were lost in extraction; only the
// initializers survive.  The magic addresses are sentinel pointers: within this
// file they are only compared by address and stored, never dereferenced.
50  = reinterpret_cast<ParagraphModel *>(0xDEAD111F);
52  = reinterpret_cast<ParagraphModel *>(0xDEAD888F);
53 
54 // Given the width of a typical space between words, what is the threshold
55 // by which we think left and right alignments for paragraphs
56 // can vary and still be aligned.
// Given the width of a typical space between words, return the slack
// (four fifths of that space) within which two edges still count as aligned.
static int Epsilon(int space_pix) {
  int slack = space_pix * 4;
  return slack / 5;
}
60 
// Exchange the values of a and b via a temporary copy.
template<typename T>
void SimpleSwap(T &a, T &b) {
  T held = a;  // hold a's value while it is overwritten
  a = b;
  b = held;
}
67 
// Validate a half-open range [row_start, row_end) against the rows vector and
// a minimum row count; log and return false on unacceptable input.
// NOTE(review): the parameter line declaring |rows| (original line 70) was
// lost in extraction.  The body calls rows->size(), so it is a pointer to a
// row vector — presumably GenericVector<RowScratchRegisters> *; confirm
// against the original source.
68 static bool AcceptableRowArgs(
69  int debug_level, int min_num_rows, const char *function_name,
71  int row_start, int row_end) {
// Out-of-range indices are hard errors and are always reported.
72  if (row_start < 0 || row_end > rows->size() || row_start > row_end) {
73  tprintf("Invalid arguments rows[%d, %d) while rows is of size %d.\n",
74  row_start, row_end, rows->size());
75  return false;
76  }
// Too few rows is a normal outcome; report it only at higher debug levels.
77  if (row_end - row_start < min_num_rows) {
78  if (debug_level > 1) {
79  tprintf("# Too few rows[%d, %d) for %s.\n",
80  row_start, row_end, function_name);
81  }
82  return false;
83  }
84  return true;
85 }
86 
87 // =============================== Debug Code ================================
88 
89 // Convert an integer to a decimal string.
90 static STRING StrOf(int num) {
91  char buffer[30];
92  snprintf(buffer, sizeof(buffer), "%d", num);
93  return STRING(buffer);
94 }
95 
96 // Given a row-major matrix of unicode text and a column separator, print
97 // a formatted table. For ASCII, we get good column alignment.
98 static void PrintTable(const GenericVector<GenericVector<STRING> > &rows,
99  const STRING &colsep) {
100  GenericVector<int> max_col_widths;
101  for (int r = 0; r < rows.size(); r++) {
102  int num_columns = rows[r].size();
103  for (int c = 0; c < num_columns; c++) {
104  int num_unicodes = 0;
105  for (int i = 0; i < rows[r][c].size(); i++) {
106  if ((rows[r][c][i] & 0xC0) != 0x80) num_unicodes++;
107  }
108  if (c >= max_col_widths.size()) {
109  max_col_widths.push_back(num_unicodes);
110  } else {
111  if (num_unicodes > max_col_widths[c])
112  max_col_widths[c] = num_unicodes;
113  }
114  }
115  }
116 
117  GenericVector<STRING> col_width_patterns;
118  for (int c = 0; c < max_col_widths.size(); c++) {
119  col_width_patterns.push_back(
120  STRING("%-") + StrOf(max_col_widths[c]) + "s");
121  }
122 
123  for (int r = 0; r < rows.size(); r++) {
124  for (int c = 0; c < rows[r].size(); c++) {
125  if (c > 0)
126  tprintf("%s", colsep.string());
127  tprintf(col_width_patterns[c].string(), rows[r][c].string());
128  }
129  tprintf("\n");
130  }
131 }
132 
133 STRING RtlEmbed(const STRING &word, bool rtlify) {
134  if (rtlify)
135  return STRING(kRLE) + word + STRING(kPDF);
136  return word;
137 }
138 
139 // Print the current thoughts of the paragraph detector.
// NOTE(review): extraction lost original lines 141-143 (the second parameter —
// presumably the rows vector — and the declaration/first push of the |output|
// table), line 149 (one more header cell), and line 153 (the per-iteration
// push of a fresh row onto |output|, which output.back() below relies on).
// Confirm against the original source.
140 static void PrintDetectorState(const ParagraphTheory &theory,
// Header row: row index, interword space, leader flag, left/right word
// summaries, then the row text.
144  output.back().push_back("#row");
145  output.back().push_back("space");
146  output.back().push_back("..");
147  output.back().push_back("lword[widthSEL]");
148  output.back().push_back("rword[widthSEL]");
150  output.back().push_back("text");
151 
152  for (int i = 0; i < rows.size(); i++) {
154  GenericVector<STRING> &row = output.back();
155  const RowInfo& ri = *rows[i].ri_;
156  row.push_back(StrOf(i));
157  row.push_back(StrOf(ri.average_interword_space));
158  row.push_back(ri.has_leaders ? ".." : " ");
// Leftmost word: width plus S/s E/e L/l flags (starts idea, ends idea,
// indicates list item — uppercase means true).
159  row.push_back(RtlEmbed(ri.lword_text, !ri.ltr) +
160  "[" + StrOf(ri.lword_box.width()) +
161  (ri.lword_likely_starts_idea ? "S" : "s") +
162  (ri.lword_likely_ends_idea ? "E" : "e") +
163  (ri.lword_indicates_list_item ? "L" : "l") +
164  "]");
// Rightmost word: same encoding as the left word.
165  row.push_back(RtlEmbed(ri.rword_text, !ri.ltr) +
166  "[" + StrOf(ri.rword_box.width()) +
167  (ri.rword_likely_starts_idea ? "S" : "s") +
168  (ri.rword_likely_ends_idea ? "E" : "e") +
169  (ri.rword_indicates_list_item ? "L" : "l") +
170  "]");
171  rows[i].AppendDebugInfo(theory, &row);
172  row.push_back(RtlEmbed(ri.text, !ri.ltr));
173  }
174  PrintTable(output, " ");
175 
176  tprintf("Active Paragraph Models:\n");
177  for (int m = 0; m < theory.models().size(); m++) {
178  tprintf(" %d: %s\n", m + 1, theory.models()[m]->ToString().string());
179  }
180 }
181 
// Conditionally dump the detector state, labeled with the pipeline phase.
// NOTE(review): the final parameter line (original line 186, the rows
// argument passed through to PrintDetectorState) was lost in extraction.
182 static void DebugDump(
183  bool should_print,
184  const STRING &phase,
185  const ParagraphTheory &theory,
187  if (!should_print)
188  return;
189  tprintf("# %s\n", phase.string());
190  PrintDetectorState(theory, rows);
191 }
192 
193 // Print out the text for rows[row_start, row_end)
194 static void PrintRowRange(const GenericVector<RowScratchRegisters> &rows,
195  int row_start, int row_end) {
196  tprintf("======================================\n");
197  for (int row = row_start; row < row_end; row++) {
198  tprintf("%s\n", rows[row].ri_->text.string());
199  }
200  tprintf("======================================\n");
201 }
202 
203 // ============= Brain Dead Language Model (ASCII Version) ===================
204 
// True for ASCII letters a-z and A-Z only (deliberately locale-independent).
bool IsLatinLetter(int ch) {
  if (ch >= 'A' && ch <= 'Z')
    return true;
  return ch >= 'a' && ch <= 'z';
}
208 
// True for letters OCR commonly confuses with digits (o/O for 0, l/I for 1).
bool IsDigitLike(int ch) {
  switch (ch) {
    case 'o':
    case 'O':
    case 'l':
    case 'I':
      return true;
    default:
      return false;
  }
}
212 
// True if ch is punctuation that typically opens a quotation or bracket.
// Note: strchr semantics are kept deliberately (ch == 0 matches the
// terminator and returns true, as in the original).
bool IsOpeningPunct(int ch) {
  static const char kOpeners[] = "'\"({[";
  return NULL != strchr(kOpeners, ch);
}
216 
// True if ch is punctuation that typically ends a sentence or clause.
// Note: strchr semantics are kept deliberately (ch == 0 matches the
// terminator and returns true, as in the original).
bool IsTerminalPunct(int ch) {
  static const char kTerminators[] = ":'\".?!]})";
  return NULL != strchr(kTerminators, ch);
}
220 
221 // Return a pointer past the longest prefix of str made of characters in toskip.
// Advance past every leading character of str that appears in toskip.
const char *SkipChars(const char *str, const char *toskip) {
  for (; *str != '\0' && strchr(toskip, *str) != NULL; ++str) {
  }
  return str;
}
226 
// Advance past every leading character of str for which skip() is true.
const char *SkipChars(const char *str, bool (*skip)(int)) {
  for (; *str != '\0' && skip(*str); ++str) {
  }
  return str;
}
231 
// Advance past at most one leading character of str that appears in toskip.
const char *SkipOne(const char *str, const char *toskip) {
  if (*str == '\0')
    return str;
  return strchr(toskip, *str) != NULL ? str + 1 : str;
}
236 
237 // Return whether it is very likely that this is a numeral marker that could
238 // start a list item. Some examples include:
239 // A I iii. VI (2) 3.5. [C-4]
240 bool LikelyListNumeral(const STRING &word) {
241  const char *kRomans = "ivxlmdIVXLMD";
242  const char *kDigits = "012345789";
243  const char *kOpen = "[{(";
244  const char *kSep = ":;-.,";
245  const char *kClose = "]})";
246 
247  int num_segments = 0;
248  const char *pos = word.string();
249  while (*pos != '\0' && num_segments < 3) {
250  // skip up to two open parens.
251  const char *numeral_start = SkipOne(SkipOne(pos, kOpen), kOpen);
252  const char *numeral_end = SkipChars(numeral_start, kRomans);
253  if (numeral_end != numeral_start) {
254  // Got Roman Numeral. Great.
255  } else {
256  numeral_end = SkipChars(numeral_start, kDigits);
257  if (numeral_end == numeral_start) {
258  // If there's a single latin letter, we can use that.
259  numeral_end = SkipChars(numeral_start, IsLatinLetter);
260  if (numeral_end - numeral_start != 1)
261  break;
262  }
263  }
264  // We got some sort of numeral.
265  num_segments++;
266  // Skip any trailing parens or punctuation.
267  pos = SkipChars(SkipChars(numeral_end, kClose), kSep);
268  if (pos == numeral_end)
269  break;
270  }
271  return *pos == '\0';
272 }
273 
274 bool LikelyListMark(const STRING &word) {
275  const char *kListMarks = "0Oo*.,+.";
276  return word.size() == 1 && strchr(kListMarks, word[0]) != NULL;
277 }
278 
279 bool AsciiLikelyListItem(const STRING &word) {
280  return LikelyListMark(word) || LikelyListNumeral(word);
281 }
282 
283 // ========== Brain Dead Language Model (Tesseract Version) ================
284 
285 // Return the first Unicode Codepoint from werd[pos].
286 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos) {
287  if (!u || !werd || pos > werd->length())
288  return 0;
289  return UNICHAR(u->id_to_unichar(werd->unichar_id(pos)), -1).first_uni();
290 }
291 
292 // A useful helper class for finding the first j >= i so that word[j]
293 // does not have given character type.
// NOTE(review): the class-head line (original line 294, presumably
// "class UnicodeSpanSkipper {") was lost in extraction.
295  public:
296  UnicodeSpanSkipper(const UNICHARSET *unicharset, const WERD_CHOICE *word)
297  : u_(unicharset), word_(word) { wordlen_ = word->length(); }
298 
299  // Given an input position, return the first position >= pos not punc.
300  int SkipPunc(int pos);
301  // Given an input position, return the first position >= pos not digit.
302  int SkipDigits(int pos);
303  // Given an input position, return the first position >= pos not roman.
304  int SkipRomans(int pos);
305  // Given an input position, return the first position >= pos not alpha.
306  int SkipAlpha(int pos);
307 
308  private:
309  const UNICHARSET *u_;
310  const WERD_CHOICE *word_;
311  int wordlen_;
312 };
313 
// NOTE(review): the four method definitions below lost their signature lines
// (original lines 314, 319, 325, 335) in extraction; from the declarations
// above, they are UnicodeSpanSkipper::SkipPunc, SkipDigits, SkipRomans and
// SkipAlpha respectively — confirm against the original source.
// SkipPunc: advance while the unichar at pos is punctuation.
315  while (pos < wordlen_ && u_->get_ispunctuation(word_->unichar_id(pos))) pos++;
316  return pos;
317 }
318 
// SkipDigits: advance while the unichar is a digit or a digit-lookalike
// letter (o, O, l, I).
320  while (pos < wordlen_ && (u_->get_isdigit(word_->unichar_id(pos)) ||
321  IsDigitLike(UnicodeFor(u_, word_, pos)))) pos++;
322  return pos;
323 }
324 
// SkipRomans: advance while the codepoint is an ASCII roman-numeral letter.
// The ch >= 0xF0 test bails out before strchr sees a multi-byte codepoint.
326  const char *kRomans = "ivxlmdIVXLMD";
327  while (pos < wordlen_) {
328  int ch = UnicodeFor(u_, word_, pos);
329  if (ch >= 0xF0 || strchr(kRomans, ch) == 0) break;
330  pos++;
331  }
332  return pos;
333 }
334 
// SkipAlpha: advance while the unichar is alphabetic.
336  while (pos < wordlen_ && u_->get_isalpha(word_->unichar_id(pos))) pos++;
337  return pos;
338 }
339 
340 bool LikelyListMarkUnicode(int ch) {
341  if (ch < 0x80) {
342  STRING single_ch;
343  single_ch += ch;
344  return LikelyListMark(single_ch);
345  }
346  switch (ch) {
347  // TODO(eger) expand this list of unicodes as needed.
348  case 0x00B0: // degree sign
349  case 0x2022: // bullet
350  case 0x25E6: // white bullet
351  case 0x00B7: // middle dot
352  case 0x25A1: // white square
353  case 0x25A0: // black square
354  case 0x25AA: // black small square
355  case 0x2B1D: // black very small square
356  case 0x25BA: // black right-pointing pointer
357  case 0x25CF: // black circle
358  case 0x25CB: // white circle
359  return true;
360  default:
361  break; // fall through
362  }
363  return false;
364 }
365 
366 // Return whether it is very likely that this is a numeral marker that could
367 // start a list item. Some examples include:
368 // A I iii. VI (2) 3.5. [C-4]
369 bool UniLikelyListItem(const UNICHARSET *u, const WERD_CHOICE *werd) {
370  if (werd->length() == 1 && LikelyListMarkUnicode(UnicodeFor(u, werd, 0)))
371  return true;
372 
373  UnicodeSpanSkipper m(u, werd);
374  int num_segments = 0;
375  int pos = 0;
376  while (pos < werd->length() && num_segments < 3) {
377  int numeral_start = m.SkipPunc(pos);
378  if (numeral_start > pos + 1) break;
379  int numeral_end = m.SkipRomans(numeral_start);
380  if (numeral_end == numeral_start) {
381  numeral_end = m.SkipDigits(numeral_start);
382  if (numeral_end == numeral_start) {
383  // If there's a single latin letter, we can use that.
384  numeral_end = m.SkipAlpha(numeral_start);
385  if (numeral_end - numeral_start != 1)
386  break;
387  }
388  }
389  // We got some sort of numeral.
390  num_segments++;
391  // Skip any trailing punctuation.
392  pos = m.SkipPunc(numeral_end);
393  if (pos == numeral_end)
394  break;
395  }
396  return pos == werd->length();
397 }
398 
399 // ========= Brain Dead Language Model (combined entry points) ================
400 
401 // Given the leftmost word of a line either as a Tesseract unicharset + werd
402 // or a utf8 string, set the following attributes for it:
403 // is_list - this word might be a list number or bullet.
404 // starts_idea - this word is likely to start a sentence.
405 // ends_idea - this word is likely to end a sentence.
406 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
407  const STRING &utf8,
408  bool *is_list, bool *starts_idea, bool *ends_idea) {
409  *is_list = false;
410  *starts_idea = false;
411  *ends_idea = false;
412  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty
413  *ends_idea = true;
414  return;
415  }
416 
417  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
418  if (UniLikelyListItem(unicharset, werd)) {
419  *is_list = true;
420  *starts_idea = true;
421  *ends_idea = true;
422  }
423  if (unicharset->get_isupper(werd->unichar_id(0))) {
424  *starts_idea = true;
425  }
426  if (unicharset->get_ispunctuation(werd->unichar_id(0))) {
427  *starts_idea = true;
428  *ends_idea = true;
429  }
430  } else { // Assume utf8 is mostly ASCII
431  if (AsciiLikelyListItem(utf8)) {
432  *is_list = true;
433  *starts_idea = true;
434  }
435  int start_letter = utf8[0];
436  if (IsOpeningPunct(start_letter)) {
437  *starts_idea = true;
438  }
439  if (IsTerminalPunct(start_letter)) {
440  *ends_idea = true;
441  }
442  if (start_letter >= 'A' && start_letter <= 'Z') {
443  *starts_idea = true;
444  }
445  }
446 }
447 
448 // Given the rightmost word of a line either as a Tesseract unicharset + werd
449 // or a utf8 string, set the following attributes for it:
450 // is_list - this word might be a list number or bullet.
451 // starts_idea - this word is likely to start a sentence.
452 // ends_idea - this word is likely to end a sentence.
453 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
454  const STRING &utf8,
455  bool *is_list, bool *starts_idea, bool *ends_idea) {
456  *is_list = false;
457  *starts_idea = false;
458  *ends_idea = false;
459  if (utf8.size() == 0 || (werd != NULL && werd->length() == 0)) { // Empty
460  *ends_idea = true;
461  return;
462  }
463 
464  if (unicharset && werd) { // We have a proper werd and unicharset so use it.
465  if (UniLikelyListItem(unicharset, werd)) {
466  *is_list = true;
467  *starts_idea = true;
468  }
469  UNICHAR_ID last_letter = werd->unichar_id(werd->length() - 1);
470  if (unicharset->get_ispunctuation(last_letter)) {
471  *ends_idea = true;
472  }
473  } else { // Assume utf8 is mostly ASCII
474  if (AsciiLikelyListItem(utf8)) {
475  *is_list = true;
476  *starts_idea = true;
477  }
478  int last_letter = utf8[utf8.size() - 1];
479  if (IsOpeningPunct(last_letter) || IsTerminalPunct(last_letter)) {
480  *ends_idea = true;
481  }
482  }
483 }
484 
485 // =============== Implementation of RowScratchRegisters =====================
// NOTE(review): every method definition in this section lost its signature
// line in extraction.  Method identities below are inferred from call sites
// and bodies visible in this file; confirm against the original source.
486 /* static */
// AppendDebugHeaderFields: header columns matching AppendDebugInfo's output.
488  GenericVector<STRING> *header) {
489  header->push_back("[lmarg,lind;rind,rmarg]");
490  header->push_back("model");
491 }
492 
// AppendDebugInfo (called above as rows[i].AppendDebugInfo(theory, &row)):
// pushes the margins/indents cell and a model-hypothesis summary string.
// NOTE(review): the snprintf argument line (original line 497, presumably the
// four margin/indent members) was lost in extraction.
494  GenericVector<STRING> *dbg) const {
495  char s[30];
496  snprintf(s, sizeof(s), "[%3d,%3d;%3d,%3d]",
498  dbg->push_back(s);
// Model summary: the line type character, then a comma-separated list of
// 1-based strong-model indices or the CrL/CrR crown sentinels.
499  STRING model_string;
500  model_string += static_cast<char>(GetLineType());
501  model_string += ":";
502 
503  int model_numbers = 0;
504  for (int h = 0; h < hypotheses_.size(); h++) {
505  if (hypotheses_[h].model == NULL)
506  continue;
507  if (model_numbers > 0)
508  model_string += ",";
509  if (StrongModel(hypotheses_[h].model)) {
510  model_string += StrOf(1 + theory.IndexOf(hypotheses_[h].model));
511  } else if (hypotheses_[h].model == kCrownLeft) {
512  model_string += "CrL";
513  } else if (hypotheses_[h].model == kCrownRight) {
514  model_string += "CrR";
515  }
516  model_numbers++;
517  }
518  if (model_numbers == 0)
519  model_string += "0";
520 
521  dbg->push_back(model_string);
522 }
523 
// Init: bind this scratch register to a RowInfo; margins start at zero and
// indents are seeded from the row's pixel distances.
525  ri_ = &row;
526  lmargin_ = 0;
527  lindent_ = row.pix_ldistance;
528  rmargin_ = 0;
529  rindent_ = row.pix_rdistance;
530 }
531 
// GetLineType (no-argument overload): classify this row from all hypotheses.
// LT_UNKNOWN if none, LT_MULTIPLE if both start and body hypotheses exist.
533  if (hypotheses_.empty())
534  return LT_UNKNOWN;
535  bool has_start = false;
536  bool has_body = false;
537  for (int i = 0; i < hypotheses_.size(); i++) {
538  switch (hypotheses_[i].ty) {
539  case LT_START: has_start = true; break;
540  case LT_BODY: has_body = true; break;
541  default:
542  tprintf("Encountered bad value in hypothesis list: %c\n",
543  hypotheses_[i].ty);
544  break;
545  }
546  }
547  if (has_start && has_body)
548  return LT_MULTIPLE;
549  return has_start ? LT_START : LT_BODY;
550 }
551 
// GetLineType(model) overload: same classification, restricted to the
// hypotheses attached to the given model.
553  if (hypotheses_.empty())
554  return LT_UNKNOWN;
555  bool has_start = false;
556  bool has_body = false;
557  for (int i = 0; i < hypotheses_.size(); i++) {
558  if (hypotheses_[i].model != model)
559  continue;
560  switch (hypotheses_[i].ty) {
561  case LT_START: has_start = true; break;
562  case LT_BODY: has_body = true; break;
563  default:
564  tprintf("Encountered bad value in hypothesis list: %c\n",
565  hypotheses_[i].ty);
566  break;
567  }
568  }
569  if (has_start && has_body)
570  return LT_MULTIPLE;
571  return has_start ? LT_START : LT_BODY;
572 }
573 
// Presumably SetStartLine: record a model-less START hypothesis, warning if
// the row already looks like a BODY line.
575  LineType current_lt = GetLineType();
576  if (current_lt != LT_UNKNOWN && current_lt != LT_START) {
577  tprintf("Trying to set a line to be START when it's already BODY.\n");
578  }
579  if (current_lt == LT_UNKNOWN || current_lt == LT_BODY) {
580  hypotheses_.push_back_new(LineHypothesis(LT_START, NULL));
581  }
582 }
583 
// Presumably SetBodyLine: mirror image of the above for BODY hypotheses.
585  LineType current_lt = GetLineType();
586  if (current_lt != LT_UNKNOWN && current_lt != LT_BODY) {
587  tprintf("Trying to set a line to be BODY when it's already START.\n");
588  }
589  if (current_lt == LT_UNKNOWN || current_lt == LT_START) {
590  hypotheses_.push_back_new(LineHypothesis(LT_BODY, NULL));
591  }
592 }
593 
// AddStartLine(model) (called as (*rows)[row].AddStartLine(model) below):
// adds a model-backed START hypothesis and drops the model-less placeholder.
595  hypotheses_.push_back_new(LineHypothesis(LT_START, model));
596  int old_idx = hypotheses_.get_index(LineHypothesis(LT_START, NULL));
597  if (old_idx >= 0)
598  hypotheses_.remove(old_idx);
599 }
600 
// AddBodyLine(model): same replacement behavior for BODY hypotheses.
602  hypotheses_.push_back_new(LineHypothesis(LT_BODY, model));
603  int old_idx = hypotheses_.get_index(LineHypothesis(LT_BODY, NULL));
604  if (old_idx >= 0)
605  hypotheses_.remove(old_idx);
606 }
607 
// Presumably StartHypotheses: collect strong models hypothesized as START.
609  for (int h = 0; h < hypotheses_.size(); h++) {
610  if (hypotheses_[h].ty == LT_START && StrongModel(hypotheses_[h].model))
611  models->push_back_new(hypotheses_[h].model);
612  }
613 }
614 
// Presumably StrongHypotheses: collect all strong models, any line type.
616  for (int h = 0; h < hypotheses_.size(); h++) {
617  if (StrongModel(hypotheses_[h].model))
618  models->push_back_new(hypotheses_[h].model);
619  }
620 }
621 
// Presumably NonNullHypotheses: collect all non-NULL models (including the
// crown sentinels, which StrongModel would exclude).
623  for (int h = 0; h < hypotheses_.size(); h++) {
624  if (hypotheses_[h].model != NULL)
625  models->push_back_new(hypotheses_[h].model);
626  }
627 }
628 
// Presumably UniqueStartHypothesis: the model of the sole hypothesis, if it
// is a single START hypothesis; NULL otherwise.
630  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_START)
631  return NULL;
632  return hypotheses_[0].model;
633 }
634 
// Presumably UniqueBodyHypothesis: analogous for a single BODY hypothesis.
636  if (hypotheses_.size() != 1 || hypotheses_[0].ty != LT_BODY)
637  return NULL;
638  return hypotheses_[0].model;
639 }
640 
641 // Discard any hypotheses whose model is not in the given list.
// (If the list is empty, all hypotheses are kept.)
643  const SetOfModels &models) {
644  if (models.empty())
645  return;
// Iterate backwards so removals do not shift unvisited indices.
646  for (int h = hypotheses_.size() - 1; h >= 0; h--) {
647  if (!models.contains(hypotheses_[h].model)) {
648  hypotheses_.remove(h);
649  }
650  }
651 }
652 
653 // ============ Geometry based Paragraph Detection Algorithm =================
654 
// A cluster of integer values: its center and how many values fell in it.
655 struct Cluster {
656  Cluster() : center(0), count(0) {}
657  Cluster(int cen, int num) : center(cen), count(num) {}
658 
659  int center; // The center of the cluster.
660  int count; // The number of entries within the cluster.
661 };
662 
// Greedy 1-D clusterer: values within max_cluster_width of a cluster's lowest
// member are merged into it (see GetClusters).
// NOTE(review): the class-head line (original line 663, presumably
// "class SimpleClusterer {") was lost in extraction.
664  public:
665  explicit SimpleClusterer(int max_cluster_width)
666  : max_cluster_width_(max_cluster_width) {}
667  void Add(int value) { values_.push_back(value); }
668  int size() const { return values_.size(); }
669  void GetClusters(GenericVector<Cluster> *clusters);
670 
671  private:
672  int max_cluster_width_;
673  GenericVectorEqEq<int> values_;
674 };
675 
676 // Return the index of the cluster closest to value.
677 int ClosestCluster(const GenericVector<Cluster> &clusters, int value) {
678  int best_index = 0;
679  for (int i = 0; i < clusters.size(); i++) {
680  if (abs(value - clusters[i].center) <
681  abs(value - clusters[best_index].center))
682  best_index = i;
683  }
684  return best_index;
685 }
686 
// NOTE(review): the signature line (original line 687) was lost in
// extraction; per the declaration above this is presumably
// "void SimpleClusterer::GetClusters(GenericVector<Cluster> *clusters) {".
// Single greedy pass over the sorted values: a cluster absorbs every value
// within max_cluster_width_ of its LOWEST member; the emitted center is the
// midpoint of the cluster's lowest and highest values.
688  clusters->clear();
689  values_.sort();
690  for (int i = 0; i < values_.size();) {
691  int orig_i = i;
692  int lo = values_[i];
693  int hi = lo;
694  while (++i < values_.size() && values_[i] <= lo + max_cluster_width_) {
695  hi = values_[i];
696  }
697  clusters->push_back(Cluster((hi + lo) / 2, i - orig_i));
698  }
699 }
700 
701 // Calculate left- and right-indent tab stop values seen in
702 // rows[row_start, row_end) given a tolerance of tolerance.
// NOTE(review): the signature line (original line 703) was lost in
// extraction; from the call in GeometricClassifierState's constructor it is
// presumably "void CalculateTabStops(GenericVector<RowScratchRegisters> *rows,".
704  int row_start, int row_end,
705  int tolerance,
706  GenericVector<Cluster> *left_tabs,
707  GenericVector<Cluster> *right_tabs) {
708  if (!AcceptableRowArgs(0, 1, __func__, rows, row_start, row_end))
709  return;
710  // First pass: toss all left and right indents into clusterers.
711  SimpleClusterer initial_lefts(tolerance);
712  SimpleClusterer initial_rights(tolerance);
713  GenericVector<Cluster> initial_left_tabs;
714  GenericVector<Cluster> initial_right_tabs;
715  for (int i = row_start; i < row_end; i++) {
716  initial_lefts.Add((*rows)[i].lindent_);
717  initial_rights.Add((*rows)[i].rindent_);
718  }
719  initial_lefts.GetClusters(&initial_left_tabs);
720  initial_rights.GetClusters(&initial_right_tabs);
721 
722  // Second pass: cluster only lines that are not "stray"
723  // An example of a stray line is a page number -- a line whose start
724  // and end tab-stops are far outside the typical start and end tab-stops
725  // for the block.
726  // Put another way, we only cluster data from lines whose start or end
727  // tab stop is frequent.
728  SimpleClusterer lefts(tolerance);
729  SimpleClusterer rights(tolerance);
// A tab stop is "frequent enough" if it occurs more than once per
// kStrayLinePer rows in the range.
730  int infrequent_enough_to_ignore = (row_end - row_start) / kStrayLinePer;
731  for (int i = row_start; i < row_end; i++) {
732  int lidx = ClosestCluster(initial_left_tabs, (*rows)[i].lindent_);
733  int ridx = ClosestCluster(initial_right_tabs, (*rows)[i].rindent_);
734  if (initial_left_tabs[lidx].count > infrequent_enough_to_ignore ||
735  initial_right_tabs[ridx].count > infrequent_enough_to_ignore) {
736  lefts.Add((*rows)[i].lindent_);
737  rights.Add((*rows)[i].rindent_);
738  }
739  }
740  lefts.GetClusters(left_tabs);
741  rights.GetClusters(right_tabs);
742 }
743 
744 // Given a paragraph model mark rows[row_start, row_end) as said model
745 // start or body lines.
746 //
747 // Case 1: model->first_indent_ != model->body_indent_
748 // Differentiating the paragraph start lines from the paragraph body lines in
749 // this case is easy, we just see how far each line is indented.
750 //
751 // Case 2: model->first_indent_ == model->body_indent_
752 // Here, we find end-of-paragraph lines by looking for "short lines."
753 // What constitutes a "short line" changes depending on whether the text
754 // ragged-right[left] or fully justified (aligned left and right).
755 //
756 // Case 2a: Ragged Right (or Left) text. (eop_threshold == 0)
757 // We have a new paragraph if the first word would have fit at the end
758 // of the previous line.
759 //
760 // Case 2b: Fully Justified. (eop_threshold > 0)
761 // We mark a line as short (end of paragraph) if the offside indent
762 // is greater than eop_threshold.
// NOTE(review): the signature line (original line 763) was lost in
// extraction; from the call in GeometricClassifyThreeTabStopTextBlock it is
// presumably "void MarkRowsWithModel(GenericVector<RowScratchRegisters> *rows,".
764  int row_start, int row_end,
765  const ParagraphModel *model,
766  bool ltr,
767  int eop_threshold) {
768  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end))
769  return;
770  for (int row = row_start; row < row_end; row++) {
// Case 1 (see comment above): differing first/body indents make the row
// match exactly one of the two line roles.
771  bool valid_first = ValidFirstLine(rows, row, model);
772  bool valid_body = ValidBodyLine(rows, row, model);
773  if (valid_first && !valid_body) {
774  (*rows)[row].AddStartLine(model);
775  } else if (valid_body && !valid_first) {
776  (*rows)[row].AddBodyLine(model);
777  } else if (valid_body && valid_first) {
// Case 2: the row fits both roles; decide by whether the previous row
// ended a paragraph.  The first row of the range counts as a start.
778  bool after_eop = (row == row_start);
779  if (row > row_start) {
780  if (eop_threshold > 0) {
// Case 2b (fully justified): the previous line is "short" if its
// offside indent exceeds eop_threshold.
781  if (model->justification() == JUSTIFICATION_LEFT) {
782  after_eop = (*rows)[row - 1].rindent_ > eop_threshold;
783  } else {
784  after_eop = (*rows)[row - 1].lindent_ > eop_threshold;
785  }
786  } else {
// Case 2a (ragged): new paragraph if this row's first word would
// have fit on the previous line.
787  after_eop = FirstWordWouldHaveFit((*rows)[row - 1], (*rows)[row],
788  model->justification());
789  }
790  }
791  if (after_eop) {
792  (*rows)[row].AddStartLine(model);
793  } else {
794  (*rows)[row].AddBodyLine(model);
795  }
796  } else {
797  // Do nothing. Stray row.
798  }
799  }
800 }
801 
802 // GeometricClassifierState holds all of the information we'll use while
803 // trying to determine a paragraph model for the text lines in a block of
804 // text:
805 // + the rows under consideration [row_start, row_end)
806 // + the common left- and right-indent tab stops
807 // + does the block start out left-to-right or right-to-left
808 // Further, this struct holds the data we amass for the (single) ParagraphModel
809 // we'll assign to the text lines (assuming we get that far).
// NOTE(review): extraction lost many lines of this struct: the struct head
// and constructor signature (original lines 810-812), two justification
// setters (822-823, 827-828 — each sets a justification, presumably the
// member |just| used below, before seeding |margin|), the AlignTabs()/
// OffsideTabs() signatures (833-834, 843-844), the FirstWordWouldHaveFit
// call line (862), a Model() builder (874-875), and several member
// declarations (879 debug_level, 883-884 rows/row_start, 888 tolerance,
// 896-897 left_tabs/right_tabs, 900/902-903 just/first_indent/body_indent,
// 906 eop_threshold).  Confirm all of these against the original source.
813  int r_start, int r_end)
814  : debug_level(dbg_level), rows(r), row_start(r_start), row_end(r_end),
815  margin(0) {
// Tolerance for "same tab stop" is derived from the typical interword space.
816  tolerance = InterwordSpace(*r, r_start, r_end);
817  CalculateTabStops(r, r_start, r_end, tolerance,
818  &left_tabs, &right_tabs);
819  ltr = (*r)[r_start].ri_->ltr;
820  }
821 
// Left-justification setter (signature lost): margin comes from the left edge.
824  margin = (*rows)[row_start].lmargin_;
825  }
826 
// Right-justification setter (signature lost): margin from the right edge.
829  margin = (*rows)[row_start].rmargin_;
830  }
831 
832  // Align tabs are the tab stops the text is aligned to.
835  return left_tabs;
836  }
837 
838  // Offside tabs are the tab stops opposite the tabs used to align the text.
839  //
840  // Note that for a left-to-right text which is aligned to the right such as
841  // this function comment, the offside tabs are the horizontal tab stops
842  // marking the beginning of ("Note", "this" and "marking").
845  return right_tabs;
846  }
847 
848  // Return whether the i'th row extends from the leftmost left tab stop
849  // to the right most right tab stop.
850  bool IsFullRow(int i) const {
851  return ClosestCluster(left_tabs, (*rows)[i].lindent_) == 0 &&
852  ClosestCluster(right_tabs, (*rows)[i].rindent_) == 0;
853  }
854 
855  int AlignsideTabIndex(int row_idx) const {
856  return ClosestCluster(AlignTabs(), (*rows)[row_idx].AlignsideIndent(just));
857  }
858 
859  // Given what we know about the paragraph justification (just), would the
860  // first word of row_b have fit at the end of row_a?
861  bool FirstWordWouldHaveFit(int row_a, int row_b) {
863  (*rows)[row_a], (*rows)[row_b], just);
864  }
865 
866  void PrintRows() const { PrintRowRange(*rows, row_start, row_end); }
867 
// Log a failure reason (and dump the rows) at or above min_debug_level.
868  void Fail(int min_debug_level, const char *why) const {
869  if (debug_level < min_debug_level) return;
870  tprintf("# %s\n", why);
871  PrintRows();
872  }
873 
// Model() builder body ended here (signature and body line lost).
876  }
877 
878  // We print out messages with a debug level at least as great as debug_level.
880 
881  // The Geometric Classifier was asked to find a single paragraph model
882  // to fit the text rows (*rows)[row_start, row_end)
885  int row_end;
886 
887  // The amount by which we expect the text edge can vary and still be aligned.
889 
890  // Is the script in this text block left-to-right?
891  // HORRIBLE ROUGH APPROXIMATION. TODO(eger): Improve
892  bool ltr;
893 
894  // These left and right tab stops were determined to be the common tab
895  // stops for the given text.
898 
899  // These are parameters we must determine to create a ParagraphModel.
901  int margin;
904 
905  // eop_threshold > 0 if the text is fully justified. See MarkRowsWithModel()
907 };
908 
909 // Given a section of text where strong textual clues did not help identifying
910 // paragraph breaks, and for which the left and right indents have exactly
911 // three tab stops between them, attempt to find the paragraph breaks based
912 // solely on the outline of the text and whether the script is left-to-right.
913 //
914 // Algorithm Detail:
915 // The selected rows are in the form of a rectangle except
916 // for some number of "short lines" of the same length:
917 //
918 // (A1) xxxxxxxxxxxxx (B1) xxxxxxxxxxxx
919 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
920 // xxxxxxxxxxxxx xxxxxxxxxxxx
921 // xxxxxxxxxxxxx xxxxxxxxxxxx
922 //
923 // We have a slightly different situation if the only short
924 // line is at the end of the excerpt.
925 //
926 // (A2) xxxxxxxxxxxxx (B2) xxxxxxxxxxxx
927 // xxxxxxxxxxxxx xxxxxxxxxxxx
928 // xxxxxxxxxxxxx xxxxxxxxxxxx
929 // xxxxxxxxxxx xxxxxxxxxx # A "short" line.
930 //
931 // We'll interpret these as follows based on the reasoning in the comment for
932 // GeometricClassify():
933 // [script direction: first indent, body indent]
934 // (A1) LtR: 2,0 RtL: 0,0 (B1) LtR: 0,0 RtL: 2,0
935 // (A2) LtR: 2,0 RtL: CrR (B2) LtR: CrL RtL: 2,0
// NOTE(review): extraction lost the function-name line (original line 936,
// presumably "static void GeometricClassifyThreeTabStopTextBlock(") and the
// GeometricClassifierState parameter line (938), as well as the bodies of the
// ltr/rtl branches below (959 and 961 — presumably assigning the block's
// justification, e.g. s.AssumeLeftJustification() / right).  Confirm against
// the original source.
937  int debug_level,
939  ParagraphTheory *theory) {
// Count full-width rows; remember whether the final row is one of them.
940  int num_rows = s.row_end - s.row_start;
941  int num_full_rows = 0;
942  int last_row_full = 0;
943  for (int i = s.row_start; i < s.row_end; i++) {
944  if (s.IsFullRow(i)) {
945  num_full_rows++;
946  if (i == s.row_end - 1) last_row_full++;
947  }
948  }
949 
950  if (num_full_rows < 0.7 * num_rows) {
951  s.Fail(1, "Not enough full lines to know which lines start paras.");
952  return;
953  }
954 
955  // eop_threshold gets set if we're fully justified; see MarkRowsWithModel()
956  s.eop_threshold = 0;
957 
// Justification is guessed from script direction (bodies lost; see NOTE).
958  if (s.ltr) {
960  } else {
962  }
963 
964  if (debug_level > 0) {
965  tprintf("# Not enough variety for clear outline classification. "
966  "Guessing these are %s aligned based on script.\n",
967  s.ltr ? "left" : "right");
968  s.PrintRows();
969  }
970 
971  if (s.AlignTabs().size() == 2) { // case A1 or A2
972  s.first_indent = s.AlignTabs()[1].center;
973  s.body_indent = s.AlignTabs()[0].center;
974  } else { // case B1 or B2
975  if (num_rows - 1 == num_full_rows - last_row_full) {
976  // case B2
// Only the final row is short: a single crown paragraph (no indent cue).
977  const ParagraphModel *model = s.ltr ? kCrownLeft : kCrownRight;
978  (*s.rows)[s.row_start].AddStartLine(model);
979  for (int i = s.row_start + 1; i < s.row_end; i++) {
980  (*s.rows)[i].AddBodyLine(model);
981  }
982  return;
983  } else {
984  // case B1
// Same first/body indent: detect paragraph ends via the offside
// threshold, halfway between the two offside tab stops.
985  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
986  s.eop_threshold = (s.OffsideTabs()[0].center +
987  s.OffsideTabs()[1].center) / 2;
988  }
989  }
990  const ParagraphModel *model = theory->AddModel(s.Model());
991  MarkRowsWithModel(s.rows, s.row_start, s.row_end, model,
992  s.ltr, s.eop_threshold);
993  return;
994 }
995 
996 // This function is called if strong textual clues were not available, but
997 // the caller hopes that the paragraph breaks will be super obvious just
998 // by the outline of the text.
999 //
1000 // The particularly difficult case is figuring out what's going on if you
1001 // don't have enough short paragraph end lines to tell us what's going on.
1002 //
1003 // For instance, let's say you have the following outline:
1004 //
1005 // (A1) xxxxxxxxxxxxxxxxxxxxxx
1006 // xxxxxxxxxxxxxxxxxxxx
1007 // xxxxxxxxxxxxxxxxxxxxxx
1008 // xxxxxxxxxxxxxxxxxxxxxx
1009 //
1010 // Even if we know that the text is left-to-right and so will probably be
1011 // left-aligned, both of the following are possible texts:
1012 //
1013 // (A1a) 1. Here our list item
1014 // with two full lines.
1015 // 2. Here a second item.
1016 // 3. Here our third one.
1017 //
1018 // (A1b) so ends paragraph one.
1019 // Here starts another
1020 // paragraph we want to
1021 // read. This continues
1022 //
1023 // These examples are obvious from the text and should have been caught
1024 // by the StrongEvidenceClassify pass. However, for languages where we don't
1025 // have capital letters to go on (e.g. Hebrew, Arabic, Hindi, Chinese),
1026 // it's worth guessing that (A1b) is the correct interpretation if there are
1027 // far more "full" lines than "short" lines.
1028 void GeometricClassify(int debug_level,
// NOTE(review): the extraction dropped the hyperlinked parameter line here
// (the GenericVector<RowScratchRegisters> *rows argument used below) — verify.
1030  int row_start, int row_end,
1031  ParagraphTheory *theory) {
1032  if (!AcceptableRowArgs(debug_level, 4, __func__, rows, row_start, row_end))
1033  return;
1034  if (debug_level > 1) {
1035  tprintf("###############################################\n");
1036  tprintf("##### GeometricClassify( rows[%d:%d) ) ####\n",
1037  row_start, row_end);
1038  tprintf("###############################################\n");
1039  }
// Recompute margins, ignoring the outlying 10% of indentation values.
1040  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1041 
1042  GeometricClassifierState s(debug_level, rows, row_start, row_end);
1043  if (s.left_tabs.size() > 2 && s.right_tabs.size() > 2) {
1044  s.Fail(2, "Too much variety for simple outline classification.");
1045  return;
1046  }
1047  if (s.left_tabs.size() <= 1 && s.right_tabs.size() <= 1) {
1048  s.Fail(1, "Not enough variety for simple outline classification.");
1049  return;
1050  }
1051  if (s.left_tabs.size() + s.right_tabs.size() == 3) {
// Exactly three tab stops is tricky enough to warrant its own routine.
1052  GeometricClassifyThreeTabStopTextBlock(debug_level, s, theory);
1053  return;
1054  }
1055 
1056  // At this point, we know that one side has at least two tab stops, and the
1057  // other side has one or two tab stops.
1058  // Left to determine:
1059  // (1) Which is the body indent and which is the first line indent?
1060  // (2) Is the text fully justified?
1061 
1062  // If one side happens to have three or more tab stops, assume that side
1063  // is opposite of the aligned side.
1064  if (s.right_tabs.size() > 2) {
// NOTE(review): dropped line — presumably s.AssumeLeftJustification();
1066  } else if (s.left_tabs.size() > 2) {
// NOTE(review): dropped line — presumably s.AssumeRightJustification();
1068  } else if (s.ltr) { // guess based on script direction
// NOTE(review): dropped line — presumably s.AssumeLeftJustification();
1070  } else {
// NOTE(review): dropped line — presumably s.AssumeRightJustification();
1072  }
1073 
1074  if (s.AlignTabs().size() == 2) {
1075  // For each tab stop on the aligned side, how many of them appear
1076  // to be paragraph start lines? [first lines]
1077  int firsts[2] = {0, 0};
1078  // Count the first line as a likely paragraph start line.
1079  firsts[s.AlignsideTabIndex(s.row_start)]++;
1080  // For each line, if the first word would have fit on the previous
1081  // line count it as a likely paragraph start line.
1082  for (int i = s.row_start + 1; i < s.row_end; i++) {
1083  if (s.FirstWordWouldHaveFit(i - 1, i)) {
1084  firsts[s.AlignsideTabIndex(i)]++;
1085  }
1086  }
1087  // Make an extra accounting for the last line of the paragraph just
1088  // in case it's the only short line in the block. That is, take its
1089  // first word as typical and see if this looks like the *last* line
1090  // of a paragraph. If so, mark the *other* indent as probably a first.
1091  if (s.FirstWordWouldHaveFit(s.row_end - 1, s.row_end - 1)) {
1092  firsts[1 - s.AlignsideTabIndex(s.row_end - 1)]++;
1093  }
1094 
1095  int percent0firsts, percent1firsts;
1096  percent0firsts = (100 * firsts[0]) / s.AlignTabs()[0].count;
1097  percent1firsts = (100 * firsts[1]) / s.AlignTabs()[1].count;
1098 
1099  // TODO(eger): Tune these constants if necessary.
// Whichever tab stop looks decisively more "first-line-like" becomes the
// first-line indent; the other becomes the body indent.
1100  if ((percent0firsts < 20 && 30 < percent1firsts) ||
1101  percent0firsts + 30 < percent1firsts) {
1102  s.first_indent = s.AlignTabs()[1].center;
1103  s.body_indent = s.AlignTabs()[0].center;
1104  } else if ((percent1firsts < 20 && 30 < percent0firsts) ||
1105  percent1firsts + 30 < percent0firsts) {
1106  s.first_indent = s.AlignTabs()[0].center;
1107  s.body_indent = s.AlignTabs()[1].center;
1108  } else {
1109  // Ambiguous! Probably lineated (poetry)
1110  if (debug_level > 1) {
1111  tprintf("# Cannot determine %s indent likely to start paragraphs.\n",
1112  s.just == tesseract::JUSTIFICATION_LEFT ? "left" : "right");
1113  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1114  s.AlignTabs()[0].center, percent0firsts);
1115  tprintf("# Indent of %d looks like a first line %d%% of the time.\n",
1116  s.AlignTabs()[1].center, percent1firsts);
1117  s.PrintRows();
1118  }
1119  return;
1120  }
1121  } else {
1122  // There's only one tab stop for the "aligned to" side.
1123  s.first_indent = s.body_indent = s.AlignTabs()[0].center;
1124  }
1125 
1126  // At this point, we have our model.
1127  const ParagraphModel *model = theory->AddModel(s.Model());
1128 
1129  // Now all we have to do is figure out if the text is fully justified or not.
1130  // eop_threshold: default to fully justified unless we see evidence below.
1131  // See description on MarkRowsWithModel()
1132  s.eop_threshold =
1133  (s.OffsideTabs()[0].center + s.OffsideTabs()[1].center) / 2;
1134  // If the text is not fully justified, re-set the eop_threshold to 0.
1135  if (s.AlignTabs().size() == 2) {
1136  // Paragraphs with a paragraph-start indent.
1137  for (int i = s.row_start; i < s.row_end - 1; i++) {
1138  if (ValidFirstLine(s.rows, i + 1, model) &&
1139  !NearlyEqual(s.OffsideTabs()[0].center,
1140  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1141  // We found a non-end-of-paragraph short line: not fully justified.
1142  s.eop_threshold = 0;
1143  break;
1144  }
1145  }
1146  } else {
1147  // Paragraphs with no paragraph-start indent.
1148  for (int i = s.row_start; i < s.row_end - 1; i++) {
1149  if (!s.FirstWordWouldHaveFit(i, i + 1) &&
1150  !NearlyEqual(s.OffsideTabs()[0].center,
1151  (*s.rows)[i].OffsideIndent(s.just), s.tolerance)) {
1152  // We found a non-end-of-paragraph short line: not fully justified.
1153  s.eop_threshold = 0;
1154  break;
1155  }
1156  }
1157  }
1158  MarkRowsWithModel(rows, row_start, row_end, model, s.ltr, s.eop_threshold);
1159 }
1160 
1161 // =============== Implementation of ParagraphTheory =====================
1162 
// ParagraphTheory::AddModel — NOTE(review): the hyperlinked signature line
// was dropped by the extraction (callers below use theory->AddModel(...));
// verify against the original source.
// Returns an existing registered model Comparable() to the argument, or
// heap-allocates a copy, registers it, and returns the copy.
1164  for (int i = 0; i < models_->size(); i++) {
1165  if ((*models_)[i]->Comparable(model))
1166  return (*models_)[i];
1167  }
1168  ParagraphModel *m = new ParagraphModel(model);
1169  models_->push_back(m);
// Track ownership so DiscardUnusedModels() below may delete it later.
1170  models_we_added_.push_back_new(m);
1171  return m;
1172 }
1173 
// ParagraphTheory::DiscardUnusedModels — NOTE(review): signature line dropped
// by the extraction; verify. Deletes every model this theory allocated that
// does not appear in used_models, removing it from both lists.
1175  for (int i = models_->size() - 1; i >= 0; i--) {
1176  ParagraphModel *m = (*models_)[i];
1177  if (!used_models.contains(m) && models_we_added_.contains(m)) {
1178  delete m;
1179  models_->remove(i);
1180  models_we_added_.remove(models_we_added_.get_index(m));
1181  }
1182  }
1183 }
1184 
1185 // Examine rows[start, end) and try to determine if an existing non-centered
1186 // paragraph model would fit them perfectly. If so, return a pointer to it.
1187 // If not, return NULL.
// NOTE(review): the first signature line was dropped by the extraction
// (a const member of ParagraphTheory per the trailing `const` below); verify.
1189  const GenericVector<RowScratchRegisters> *rows, int start, int end) const {
1190  for (int m = 0; m < models_->size(); m++) {
1191  const ParagraphModel *model = (*models_)[m];
1192  if (model->justification() != JUSTIFICATION_CENTER &&
1193  RowsFitModel(rows, start, end, model))
1194  return model;
1195  }
1196  return NULL;
1197 }
1198 
// ParagraphTheory::NonCenteredModels — NOTE(review): signature line dropped;
// verify. Appends every registered model that is not center-justified to
// the output set.
1200  for (int m = 0; m < models_->size(); m++) {
1201  const ParagraphModel *model = (*models_)[m];
1202  if (model->justification() != JUSTIFICATION_CENTER)
1203  models->push_back_new(model);
1204  }
1205 }
1206 
1207 int ParagraphTheory::IndexOf(const ParagraphModel *model) const {
1208  for (int i = 0; i < models_->size(); i++) {
1209  if ((*models_)[i] == model)
1210  return i;
1211  }
1212  return -1;
1213 }
1214 
// bool ValidFirstLine(...) — NOTE(review): the first signature line (the
// rows parameter) was dropped by the extraction; verify.
// Returns whether rows[row] could be the first line of *model. Only strong
// (real, non-crown) models may be queried; weak models get a warning.
1216  int row, const ParagraphModel *model) {
1217  if (!StrongModel(model)) {
1218  tprintf("ValidFirstLine() should only be called with strong models!\n");
1219  }
1220  return StrongModel(model) &&
1221  model->ValidFirstLine(
1222  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1223  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1224 }
1225 
// bool ValidBodyLine(...) — NOTE(review): first signature line dropped;
// verify. Returns whether rows[row] could be a body (non-first) line of
// *model; same strong-model requirement as ValidFirstLine().
1227  int row, const ParagraphModel *model) {
1228  if (!StrongModel(model)) {
1229  tprintf("ValidBodyLine() should only be called with strong models!\n");
1230  }
1231  return StrongModel(model) &&
1232  model->ValidBodyLine(
1233  (*rows)[row].lmargin_, (*rows)[row].lindent_,
1234  (*rows)[row].rindent_, (*rows)[row].rmargin_);
1235 }
1236 
// bool CrownCompatible(...) — NOTE(review): first signature line dropped;
// verify. Returns whether rows a and b share the crown's aligned edge
// (right edge for kCrownRight, left edge otherwise) to within an Epsilon
// of the average interword space.
1238  int a, int b, const ParagraphModel *model) {
1239  if (model != kCrownRight && model != kCrownLeft) {
1240  tprintf("CrownCompatible() should only be called with crown models!\n");
1241  return false;
1242  }
1243  RowScratchRegisters &row_a = (*rows)[a];
1244  RowScratchRegisters &row_b = (*rows)[b];
1245  if (model == kCrownRight) {
1246  return NearlyEqual(row_a.rindent_ + row_a.rmargin_,
1247  row_b.rindent_ + row_b.rmargin_,
1248  Epsilon(row_a.ri_->average_interword_space));
1249  }
1250  return NearlyEqual(row_a.lindent_ + row_a.lmargin_,
1251  row_b.lindent_ + row_b.lmargin_,
1252  Epsilon(row_a.ri_->average_interword_space));
1253 }
1254 
1255 
1256 // =============== Implementation of ParagraphModelSmearer ====================
1257 
// ParagraphModelSmearer constructor — NOTE(review): the extraction dropped
// the two hyperlinked signature lines (the constructor name and the rows
// parameter); only the trailing parameter line is shown below. Verify.
1260  int row_start, int row_end, ParagraphTheory *theory)
1261  : theory_(theory), rows_(rows), row_start_(row_start),
1262  row_end_(row_end) {
1263  if (!AcceptableRowArgs(0, 0, __func__, rows, row_start, row_end)) {
// Degenerate arguments: collapse to an empty range so Smear() is a no-op.
1264  row_start_ = 0;
1265  row_end_ = 0;
1266  return;
1267  }
// One (initially empty) set of open models per row, with a sentinel slot
// on either end of [row_start, row_end).
1268  SetOfModels no_models;
1269  for (int row = row_start - 1; row <= row_end; row++) {
1270  open_models_.push_back(no_models);
1271  }
1272 }
1273 
1274 // see paragraphs_internal.h
// Recompute, for each row in the (clamped) range, which models are still
// "open" — started on an earlier row and not yet invalidated — when that
// row begins.
1275 void ParagraphModelSmearer::CalculateOpenModels(int row_start, int row_end) {
1276  SetOfModels no_models;
1277  if (row_start < row_start_) row_start = row_start_;
1278  if (row_end > row_end_) row_end = row_end_;
1279 
1280  for (int row = (row_start > 0) ? row_start - 1 : row_start; row < row_end;
1281  row++) {
1282  if ((*rows_)[row].ri_->num_words == 0) {
// A row with no words interrupts any paragraph: close all models.
1283  OpenModels(row + 1) = no_models;
1284  } else {
1285  SetOfModels &opened = OpenModels(row);
1286  (*rows_)[row].StartHypotheses(&opened);
1287 
1288  // Which models survive the transition from row to row + 1?
1289  SetOfModels still_open;
1290  for (int m = 0; m < opened.size(); m++) {
1291  if (ValidFirstLine(rows_, row, opened[m]) ||
1292  ValidBodyLine(rows_, row, opened[m])) {
1293  // This is basic filtering; we check likely paragraph starty-ness down
1294  // below in Smear() -- you know, whether the first word would have fit
1295  // and such.
1296  still_open.push_back_new(opened[m]);
1297  }
1298  }
1299  OpenModels(row + 1) = still_open;
1300  }
1301  }
1302 }
1303 
1304 // see paragraphs_internal.h
// NOTE(review): the hyperlinked line `void ParagraphModelSmearer::Smear() {`
// was dropped by the extraction here; verify against the original source.
1306  CalculateOpenModels(row_start_, row_end_);
1307 
1308  // For each row which we're unsure about (that is, it is LT_UNKNOWN or
1309  // we have multiple LT_START hypotheses), see if there's a model that
1310  // was recently used (an "open" model) which might model it well.
1311  for (int i = row_start_; i < row_end_; i++) {
1312  RowScratchRegisters &row = (*rows_)[i];
1313  if (row.ri_->num_words == 0)
1314  continue;
1315 
1316  // Step One:
1317  // Figure out if there are "open" models which are left-aligned or
1318  // right-aligned. This is important for determining whether the
1319  // "first" word in a row would fit at the "end" of the previous row.
1320  bool left_align_open = false;
1321  bool right_align_open = false;
1322  for (int m = 0; m < OpenModels(i).size(); m++) {
1323  switch (OpenModels(i)[m]->justification()) {
1324  case JUSTIFICATION_LEFT: left_align_open = true; break;
1325  case JUSTIFICATION_RIGHT: right_align_open = true; break;
1326  default: left_align_open = right_align_open = true;
1327  }
1328  }
1329  // Step Two:
1330  // Use that knowledge to figure out if this row is likely to
1331  // start a paragraph.
1332  bool likely_start;
1333  if (i == 0) {
1334  likely_start = true;
1335  } else {
1336  if ((left_align_open && right_align_open) ||
1337  (!left_align_open && !right_align_open)) {
1338  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
1339  JUSTIFICATION_LEFT) ||
1340  LikelyParagraphStart((*rows_)[i - 1], row,
// NOTE(review): dropped hyperlinked line — presumably the argument
// JUSTIFICATION_RIGHT); closing this call. Verify.
1342  } else if (left_align_open) {
1343  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
// NOTE(review): dropped line — presumably JUSTIFICATION_LEFT); verify.
1345  } else {
1346  likely_start = LikelyParagraphStart((*rows_)[i - 1], row,
// NOTE(review): dropped line — presumably JUSTIFICATION_RIGHT); verify.
1348  }
1349  }
1350 
1351  // Step Three:
1352  // If this text line seems like an obvious first line of an
1353  // open model, or an obvious continuation of an existing
1354  // modelled paragraph, mark it up.
1355  if (likely_start) {
1356  // Add Start Hypotheses for all Open models that fit.
1357  for (int m = 0; m < OpenModels(i).size(); m++) {
1358  if (ValidFirstLine(rows_, i, OpenModels(i)[m])) {
1359  row.AddStartLine(OpenModels(i)[m]);
1360  }
1361  }
1362  } else {
1363  // Add relevant body line hypotheses.
1364  SetOfModels last_line_models;
1365  if (i > 0) {
1366  (*rows_)[i - 1].StrongHypotheses(&last_line_models);
1367  } else {
1368  theory_->NonCenteredModels(&last_line_models);
1369  }
1370  for (int m = 0; m < last_line_models.size(); m++) {
1371  const ParagraphModel *model = last_line_models[m];
1372  if (ValidBodyLine(rows_, i, model))
1373  row.AddBodyLine(model);
1374  }
1375  }
1376 
1377  // Step Four:
1378  // If we're still quite unsure about this line, go through all
1379  // models in our theory and see if this row could be the start
1380  // of any of our models.
1381  if (row.GetLineType() == LT_UNKNOWN ||
1382  (row.GetLineType() == LT_START && !row.UniqueStartHypothesis())) {
1383  SetOfModels all_models;
1384  theory_->NonCenteredModels(&all_models);
1385  for (int m = 0; m < all_models.size(); m++) {
1386  if (ValidFirstLine(rows_, i, all_models[m])) {
1387  row.AddStartLine(all_models[m]);
1388  }
1389  }
1390  }
1391  // Step Five:
1392  // Since we may have updated the hypotheses about this row, we need
1393  // to recalculate the Open models for the rest of rows[i + 1, row_end)
1394  if (row.GetLineType() != LT_UNKNOWN) {
1395  CalculateOpenModels(i + 1, row_end_);
1396  }
1397  }
1398 }
1399 
1400 // ================ Main Paragraph Detection Algorithm =======================
1401 
1402 // Find out what ParagraphModels are actually used, and discard any
1403 // that are not.
// NOTE(review): the first signature line of this free-standing
// DiscardUnusedModels overload (its rows parameter) was dropped by the
// extraction; verify against the original source.
1405  ParagraphTheory *theory) {
// Collect every model referenced by a strong hypothesis on any row, then
// let the theory delete the rest.
1406  SetOfModels used_models;
1407  for (int i = 0; i < rows.size(); i++) {
1408  rows[i].StrongHypotheses(&used_models);
1409  }
1410  theory->DiscardUnusedModels(used_models);
1411 }
1412 
1413 // DowngradeWeakestToCrowns:
1414 // Forget any flush-{left, right} models unless we see two or more
1415 // of them in sequence.
1416 //
1417 // In pass 3, we start to classify even flush-left paragraphs (paragraphs
1418 // where the first line and body indent are the same) as having proper Models.
1419 // This is generally dangerous, since if you start imagining that flush-left
1420 // is a typical paragraph model when it is not, it will lead you to chop normal
1421 // indented paragraphs in the middle whenever a sentence happens to start on a
1422 // new line (see "This" above). What to do?
1423 // What we do is to take any paragraph which is flush left and is not
1424 // preceded by another paragraph of the same model and convert it to a "Crown"
1425 // paragraph. This is a weak pseudo-ParagraphModel which is a placeholder
1426 // for later. It means that the paragraph is flush, but it would be desirable
1427 // to mark it as the same model as following text if it fits. This downgrade
1428 // FlushLeft -> CrownLeft -> Model of following paragraph. Means that we
1429 // avoid making flush left Paragraph Models whenever we see a top-of-the-page
1430 // half-of-a-paragraph. and instead we mark it the same as normal body text.
1431 //
1432 // Implementation:
1433 //
1434 // Comb backwards through the row scratch registers, and turn any
1435 // sequences of body lines of equivalent type abutted against the beginning
1436 // or a body or start line of a different type into a crown paragraph.
1437 void DowngradeWeakestToCrowns(int debug_level,
1438  ParagraphTheory *theory,
// NOTE(review): dropped hyperlinked parameter line here — the
// GenericVector<RowScratchRegisters> *rows argument used below; verify.
1440  int start;
1441  for (int end = rows->size(); end > 0; end = start) {
1442  // Search back for a body line of a unique type.
1443  const ParagraphModel *model = NULL;
1444  while (end > 0 &&
1445  (model = (*rows)[end - 1].UniqueBodyHypothesis()) == NULL) {
1446  end--;
1447  }
1448  if (end == 0) break;
1449  start = end - 1;
1450  while (start >= 0 && (*rows)[start].UniqueBodyHypothesis() == model) {
1451  start--; // walk back to the first line that is not the same body type.
1452  }
// A flush start line of the same model also belongs to this run.
1453  if (start >= 0 && (*rows)[start].UniqueStartHypothesis() == model &&
1454  StrongModel(model) &&
1455  NearlyEqual(model->first_indent(), model->body_indent(),
1456  model->tolerance())) {
1457  start--;
1458  }
1459  start++;
1460  // Now rows[start, end) is a sequence of unique body hypotheses of model.
1461  if (StrongModel(model) && model->justification() == JUSTIFICATION_CENTER)
1462  continue;
1463  if (!StrongModel(model)) {
// For weak (crown) models, extend the run upward over any rows that
// share the crown's aligned edge.
1464  while (start > 0 &&
1465  CrownCompatible(rows, start - 1, start, model))
1466  start--;
1467  }
// Downgrade to a crown unless a valid first line of the model precedes
// the run (in which case the run is a genuine continuation).
1468  if (start == 0 ||
1469  (!StrongModel(model)) ||
1470  (StrongModel(model) && !ValidFirstLine(rows, start - 1, model))) {
1471  // crownify rows[start, end)
1472  const ParagraphModel *crown_model = model;
1473  if (StrongModel(model)) {
1474  if (model->justification() == JUSTIFICATION_LEFT)
1475  crown_model = kCrownLeft;
1476  else
1477  crown_model = kCrownRight;
1478  }
1479  (*rows)[start].SetUnknown();
1480  (*rows)[start].AddStartLine(crown_model);
1481  for (int row = start + 1; row < end; row++) {
1482  (*rows)[row].SetUnknown();
1483  (*rows)[row].AddBodyLine(crown_model);
1484  }
1485  }
1486  }
1487  DiscardUnusedModels(*rows, theory);
1488 }
1489 
1490 
1491 // Clear all hypotheses about lines [start, end) and reset margins.
1492 //
1493 // The empty space between the left of a row and the block boundary (and
1494 // similarly for the right) is split into two pieces: margin and indent.
1495 // In initial processing, we assume the block is tight and the margin for
1496 // all lines is set to zero. However, if our first pass does not yield
1497 // models for everything, it may be due to an inset paragraph like a
1498 // block-quote. In that case, we make a second pass over that unmarked
1499 // section of the page and reset the "margin" portion of the empty space
1500 // to the common amount of space at the ends of the lines under consid-
1501 // eration. This would be equivalent to percentile set to 0. However,
1502 // sometimes we have a single character sticking out in the right margin
1503 // of a text block (like the 'r' in 'for' on line 3 above), and we can
1504 // really just ignore it as an outlier. To express this, we allow the
1505 // user to specify the percentile (0..100) of indent values to use as
1506 // the common margin for each row in the run of rows[start, end).
// NOTE(review): the hyperlinked line carrying the function name
// (RecomputeMarginsAndClearHypotheses, per callers above) was dropped by
// the extraction; only the parameter lines remain. Verify.
1508  GenericVector<RowScratchRegisters> *rows, int start, int end,
1509  int percentile) {
1510  if (!AcceptableRowArgs(0, 0, __func__, rows, start, end))
1511  return;
1512 
// First pass: clear hypotheses and find the range of total left/right
// space (margin + indent) over all non-empty rows.
1513  int lmin, lmax, rmin, rmax;
1514  lmin = lmax = (*rows)[start].lmargin_ + (*rows)[start].lindent_;
1515  rmin = rmax = (*rows)[start].rmargin_ + (*rows)[start].rindent_;
1516  for (int i = start; i < end; i++) {
1517  RowScratchRegisters &sr = (*rows)[i];
1518  sr.SetUnknown();
1519  if (sr.ri_->num_words == 0)
1520  continue;
1521  UpdateRange(sr.lmargin_ + sr.lindent_, &lmin, &lmax);
1522  UpdateRange(sr.rmargin_ + sr.rindent_, &rmin, &rmax);
1523  }
// Second pass: build histograms of the total space on each side.
1524  STATS lefts(lmin, lmax + 1);
1525  STATS rights(rmin, rmax + 1);
1526  for (int i = start; i < end; i++) {
1527  RowScratchRegisters &sr = (*rows)[i];
1528  if (sr.ri_->num_words == 0)
1529  continue;
1530  lefts.add(sr.lmargin_ + sr.lindent_, 1);
1531  rights.add(sr.rmargin_ + sr.rindent_, 1);
1532  }
// Take the requested percentile of each histogram as the common margin,
// then shift each row's margin/indent split to match (total is preserved).
1533  int ignorable_left = lefts.ile(ClipToRange(percentile, 0, 100) / 100.0);
1534  int ignorable_right = rights.ile(ClipToRange(percentile, 0, 100) / 100.0);
1535  for (int i = start; i < end; i++) {
1536  RowScratchRegisters &sr = (*rows)[i];
1537  int ldelta = ignorable_left - sr.lmargin_;
1538  sr.lmargin_ += ldelta;
1539  sr.lindent_ -= ldelta;
1540  int rdelta = ignorable_right - sr.rmargin_;
1541  sr.rmargin_ += rdelta;
1542  sr.rindent_ -= rdelta;
1543  }
1544 }
1545 
1546 // Return the minimum inter-word space in rows[row_start, row_end).
// NOTE(review): the first signature line (the rows parameter) was dropped
// by the extraction; verify against the original source.
1548  int row_start, int row_end) {
// Empty range: fall back to 1. Otherwise take the minimum of the average
// interword space over rows that actually contain 2+ words; rows with at
// most one word carry no interword evidence.
1549  if (row_end < row_start + 1) return 1;
1550  bool legit = false;
1551  int natural_space = rows[row_start].ri_->average_interword_space;
1552  for (int i = row_start; i < row_end; i++) {
1553  if (rows[i].ri_->num_words > 1) {
1554  if (!legit) {
1555  natural_space = rows[i].ri_->average_interword_space;
1556  legit = true;
1557  } else {
1558  if (rows[i].ri_->average_interword_space < natural_space)
1559  natural_space = rows[i].ri_->average_interword_space;
1560  }
1561  }
1562  }
1563  return natural_space;
1564 }
1565 
1566 // Return whether the first word on the after line can fit in the space at
1567 // the end of the before line (knowing which way the text is aligned and read).
// NOTE(review): the first signature line (the before parameter) was dropped
// by the extraction; verify.
1569  const RowScratchRegisters &after,
1570  tesseract::ParagraphJustification justification) {
// A row without words poses no obstacle: trivially "would have fit."
1571  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1572  return true;
1573 
1574  if (justification == JUSTIFICATION_UNKNOWN) {
1575  tprintf("Don't call FirstWordWouldHaveFit(r, s, JUSTIFICATION_UNKNOWN).\n");
1576  }
// Free space at the end of `before`: both indents for centered text,
// otherwise the indent on the side opposite the alignment; minus one
// interword space for the gap before the candidate word.
1577  int available_space;
1578  if (justification == JUSTIFICATION_CENTER) {
1579  available_space = before.lindent_ + before.rindent_;
1580  } else {
1581  available_space = before.OffsideIndent(justification);
1582  }
1583  available_space -= before.ri_->average_interword_space;
1584 
// The "first" word is the leftmost word for LTR text, rightmost for RTL.
1585  if (before.ri_->ltr)
1586  return after.ri_->lword_box.width() < available_space;
1587  return after.ri_->rword_box.width() < available_space;
1588 }
1589 
1590 // Return whether the first word on the after line can fit in the space at
1591 // the end of the before line (not knowing which way the text goes) in a left
1592 // or right alignment.
// NOTE(review): the first signature line (the before parameter) was dropped
// by the extraction; verify.
1594  const RowScratchRegisters &after) {
1595  if (before.ri_->num_words == 0 || after.ri_->num_words == 0)
1596  return true;
1597 
// Direction unknown: be generous and use the larger of the two indents.
1598  int available_space = before.lindent_;
1599  if (before.rindent_ > available_space)
1600  available_space = before.rindent_;
1601  available_space -= before.ri_->average_interword_space;
1602 
1603  if (before.ri_->ltr)
1604  return after.ri_->lword_box.width() < available_space;
1605  return after.ri_->rword_box.width() < available_space;
1606 }
1607 
// TextSupportsBreak(before, after) — NOTE(review): the signature lines were
// dropped by the extraction; verify. Returns whether the words themselves
// suggest a paragraph break between the two rows.
1609  const RowScratchRegisters &after) {
1610  if (before.ri_->ltr) {
1611  return before.ri_->rword_likely_ends_idea &&
// NOTE(review): dropped line — presumably
// after.ri_->lword_likely_starts_idea; verify.
1613  } else {
1614  return before.ri_->lword_likely_ends_idea &&
// NOTE(review): dropped line — presumably
// after.ri_->rword_likely_starts_idea; verify.
1616  }
1617 }
1618 
// LikelyParagraphStart(before, after) — NOTE(review): the signature lines
// were dropped by the extraction; verify. A row likely starts a paragraph
// if the previous row is empty, or the first word would have fit on it and
// the text itself supports a break.
1620  const RowScratchRegisters &after) {
1621  return before.ri_->num_words == 0 ||
1622  (FirstWordWouldHaveFit(before, after) &&
1623  TextSupportsBreak(before, after));
1624 }
1625 
// LikelyParagraphStart(before, after, j) — NOTE(review): the first signature
// line and the justification parameter's declaration line were dropped by
// the extraction; verify. Same as above but justification-aware.
1627  const RowScratchRegisters &after,
1629  return before.ri_->num_words == 0 ||
1630  (FirstWordWouldHaveFit(before, after, j) &&
1631  TextSupportsBreak(before, after));
1632 }
1633 
1634 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1635 // would fit them as a single paragraph.
1636 // If we can't produce a unique model justification_ = JUSTIFICATION_UNKNOWN.
1637 // If the rows given could be a consistent start to a paragraph, set *consistent
1638 // true.
// NOTE(review): the two hyperlinked signature lines (the function name —
// InternalParagraphModelByOutline per the caller below — and the rows
// parameter) were dropped by the extraction; verify.
1641  int start, int end, int tolerance, bool *consistent) {
// Decide the dominant reading direction by majority vote over the rows.
1642  int ltr_line_count = 0;
1643  for (int i = start; i < end; i++) {
1644  ltr_line_count += static_cast<int>((*rows)[i].ri_->ltr);
1645  }
1646  bool ltr = (ltr_line_count >= (end - start) / 2);
1647 
1648  *consistent = true;
1649  if (!AcceptableRowArgs(0, 2, __func__, rows, start, end))
1650  return ParagraphModel();
1651 
1652  // Ensure the caller only passed us a region with a common rmargin and
1653  // lmargin.
1654  int lmargin = (*rows)[start].lmargin_;
1655  int rmargin = (*rows)[start].rmargin_;
// Track the spread of left/right indents over the body lines (start + 1
// onward), and of their difference (cmin/cmax) to detect centering.
1656  int lmin, lmax, rmin, rmax, cmin, cmax;
1657  lmin = lmax = (*rows)[start + 1].lindent_;
1658  rmin = rmax = (*rows)[start + 1].rindent_;
1659  cmin = cmax = 0;
1660  for (int i = start + 1; i < end; i++) {
1661  if ((*rows)[i].lmargin_ != lmargin || (*rows)[i].rmargin_ != rmargin) {
1662  tprintf("Margins don't match! Software error.\n");
1663  *consistent = false;
1664  return ParagraphModel();
1665  }
1666  UpdateRange((*rows)[i].lindent_, &lmin, &lmax);
1667  UpdateRange((*rows)[i].rindent_, &rmin, &rmax);
1668  UpdateRange((*rows)[i].rindent_ - (*rows)[i].lindent_, &cmin, &cmax);
1669  }
1670  int ldiff = lmax - lmin;
1671  int rdiff = rmax - rmin;
1672  int cdiff = cmax - cmin;
// Both edges ragged but the difference steady: centered text.
1673  if (rdiff > tolerance && ldiff > tolerance) {
1674  if (cdiff < tolerance * 2) {
1675  if (end - start < 3)
1676  return ParagraphModel();
1677  return ParagraphModel(JUSTIFICATION_CENTER, 0, 0, 0, tolerance);
1678  }
1679  *consistent = false;
1680  return ParagraphModel();
1681  }
1682  if (end - start < 3) // Don't return a model for two line paras.
1683  return ParagraphModel();
1684 
1685  // These booleans keep us from saying something is aligned left when the body
1686  // left variance is too large.
1687  bool body_admits_left_alignment = ldiff < tolerance;
1688  bool body_admits_right_alignment = rdiff < tolerance;
1689 
1690  ParagraphModel left_model =
1691  ParagraphModel(JUSTIFICATION_LEFT, lmargin, (*rows)[start].lindent_,
1692  (lmin + lmax) / 2, tolerance);
1693  ParagraphModel right_model =
1694  ParagraphModel(JUSTIFICATION_RIGHT, rmargin, (*rows)[start].rindent_,
1695  (rmin + rmax) / 2, tolerance);
1696 
1697  // These booleans keep us from having an indent on the "wrong side" for the
1698  // first line.
1699  bool text_admits_left_alignment = ltr || left_model.is_flush();
1700  bool text_admits_right_alignment = !ltr || right_model.is_flush();
1701 
1702  // At least one of the edges is less than tolerance in variance.
1703  // If the other is obviously ragged, it can't be the one aligned to.
1704  // [Note the last line is included in this raggedness.]
1705  if (tolerance < rdiff) {
1706  if (body_admits_left_alignment && text_admits_left_alignment)
1707  return left_model;
1708  *consistent = false;
1709  return ParagraphModel();
1710  }
1711  if (tolerance < ldiff) {
1712  if (body_admits_right_alignment && text_admits_right_alignment)
1713  return right_model;
1714  *consistent = false;
1715  return ParagraphModel();
1716  }
1717 
1718  // At this point, we know the body text doesn't vary much on either side.
1719 
1720  // If the first line juts out oddly in one direction or the other,
1721  // that likely indicates the side aligned to.
1722  int first_left = (*rows)[start].lindent_;
1723  int first_right = (*rows)[start].rindent_;
1724 
1725  if (ltr && body_admits_left_alignment &&
1726  (first_left < lmin || first_left > lmax))
1727  return left_model;
1728  if (!ltr && body_admits_right_alignment &&
1729  (first_right < rmin || first_right > rmax))
1730  return right_model;
1731 
1732  *consistent = false;
1733  return ParagraphModel();
1734 }
1735 
1736 // Examine rows[start, end) and try to determine what sort of ParagraphModel
1737 // would fit them as a single paragraph. If nothing fits,
1738 // justification_ = JUSTIFICATION_UNKNOWN and print the paragraph to debug
1739 // output if we're debugging.
// NOTE(review): the hyperlinked signature lines (the function name —
// ParagraphModelByOutline — and the rows parameter) were dropped by the
// extraction; verify.
1741  int debug_level,
1743  int start, int end, int tolerance) {
1744  bool unused_consistent;
// NOTE(review): dropped line — presumably
// ParagraphModel retval = InternalParagraphModelByOutline( ... continuing
// with the argument line below. Verify.
1746  rows, start, end, tolerance, &unused_consistent);
1747  if (debug_level >= 2 && retval.justification() == JUSTIFICATION_UNKNOWN) {
1748  tprintf("Could not determine a model for this paragraph:\n");
1749  PrintRowRange(*rows, start, end);
1750  }
1751  return retval;
1752 }
1753 
1754 // Do rows[start, end) form a single instance of the given paragraph model?
// NOTE(review): the first signature line (the rows parameter) was dropped
// by the extraction; verify.
1756  int start, int end, const ParagraphModel *model) {
1757  if (!AcceptableRowArgs(0, 1, __func__, rows, start, end))
1758  return false;
// The first row must fit the model as a first line, every later row as a
// body line.
1759  if (!ValidFirstLine(rows, start, model)) return false;
1760  for (int i = start + 1 ; i < end; i++) {
1761  if (!ValidBodyLine(rows, i, model)) return false;
1762  }
1763  return true;
1764 }
1765 
1766 // Examine rows[row_start, row_end) as an independent section of text,
1767 // and mark rows that are exceptionally clear as start-of-paragraph
1768 // and paragraph-body lines.
1769 //
1770 // We presume that any lines surrounding rows[row_start, row_end) may
1771 // have wildly different paragraph models, so we don't key any data off
1772 // of those lines.
1773 //
1774 // We only take the very strongest signals, as we don't want to get
1775 // confused and marking up centered text, poetry, or source code as
1776 // clearly part of a typical paragraph.
// NOTE(review): the first signature line (function name and rows parameter)
// was dropped by the extraction; verify.
1778  int row_start, int row_end) {
1779  // Record patently obvious body text.
1780  for (int i = row_start + 1; i < row_end; i++) {
1781  const RowScratchRegisters &prev = (*rows)[i - 1];
1782  RowScratchRegisters &curr = (*rows)[i];
1783  tesseract::ParagraphJustification typical_justification =
// NOTE(review): dropped hyperlinked line — the initializer of
// typical_justification was removed by the extraction; verify against
// the original source.
1785  if (!curr.ri_->rword_likely_starts_idea &&
1786  !curr.ri_->lword_likely_starts_idea &&
1787  !FirstWordWouldHaveFit(prev, curr, typical_justification)) {
1788  curr.SetBodyLine();
1789  }
1790  }
1791 
1792  // Record patently obvious start paragraph lines.
1793  //
1794  // It's an extremely good signal of the start of a paragraph that
1795  // the first word would have fit on the end of the previous line.
1796  // However, applying just that signal would have us mark random
1797  // start lines of lineated text (poetry and source code) and some
1798  // centered headings as paragraph start lines. Therefore, we use
1799  // a second qualification for a paragraph start: Not only should
1800  // the first word of this line have fit on the previous line,
1801  // but also, this line should go full to the right of the block,
1802  // disallowing a subsequent word from having fit on this line.
1803 
1804  // First row:
1805  {
1806  RowScratchRegisters &curr = (*rows)[row_start];
1807  RowScratchRegisters &next = (*rows)[row_start + 1];
// NOTE(review): dropped lines — the declaration/initialization of the
// justification variable j used below was removed by the extraction;
// verify.
1810  if (curr.GetLineType() == LT_UNKNOWN &&
1811  !FirstWordWouldHaveFit(curr, next, j) &&
1812  (curr.ri_->lword_likely_starts_idea ||
1813  curr.ri_->rword_likely_starts_idea)) {
1814  curr.SetStartLine();
1815  }
1816  }
1817  // Middle rows
1818  for (int i = row_start + 1; i < row_end - 1; i++) {
1819  RowScratchRegisters &prev = (*rows)[i - 1];
1820  RowScratchRegisters &curr = (*rows)[i];
1821  RowScratchRegisters &next = (*rows)[i + 1];
// NOTE(review): dropped lines — the declaration/initialization of the
// justification variable j used below; verify.
1824  if (curr.GetLineType() == LT_UNKNOWN &&
1825  !FirstWordWouldHaveFit(curr, next, j) &&
1826  LikelyParagraphStart(prev, curr, j)) {
1827  curr.SetStartLine();
1828  }
1829  }
1830  // Last row
1831  { // the short circuit at the top means we have at least two lines.
1832  RowScratchRegisters &prev = (*rows)[row_end - 2];
1833  RowScratchRegisters &curr = (*rows)[row_end - 1];
// NOTE(review): dropped lines — the declaration/initialization of the
// justification variable j used below; verify.
1836  if (curr.GetLineType() == LT_UNKNOWN &&
1837  !FirstWordWouldHaveFit(curr, curr, j) &&
1838  LikelyParagraphStart(prev, curr, j)) {
1839  curr.SetStartLine();
1840  }
1841  }
1842 }
1843 
1844 // Look for sequences of a start line followed by some body lines in
1845 // rows[row_start, row_end) and create ParagraphModels for them if
1846 // they seem coherent.
1847 void ModelStrongEvidence(int debug_level,
1849  int row_start, int row_end,
1850  bool allow_flush_models,
1851  ParagraphTheory *theory) {
1852  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1853  return;
1854 
1855  int start = row_start;
1856  while (start < row_end) {
1857  while (start < row_end && (*rows)[start].GetLineType() != LT_START)
1858  start++;
1859  if (start >= row_end - 1)
1860  break;
1861 
1862  int tolerance = Epsilon((*rows)[start + 1].ri_->average_interword_space);
1863  int end = start;
1864  ParagraphModel last_model;
1865  bool next_consistent;
1866  do {
1867  ++end;
1868  // rows[row, end) was consistent.
1869  // If rows[row, end + 1) is not consistent,
1870  // just model rows[row, end)
1871  if (end < row_end - 1) {
1872  RowScratchRegisters &next = (*rows)[end];
1873  LineType lt = next.GetLineType();
1874  next_consistent = lt == LT_BODY ||
1875  (lt == LT_UNKNOWN &&
1876  !FirstWordWouldHaveFit((*rows)[end - 1], (*rows)[end]));
1877  } else {
1878  next_consistent = false;
1879  }
1880  if (next_consistent) {
1882  rows, start, end + 1, tolerance, &next_consistent);
1883  if (((*rows)[start].ri_->ltr &&
1884  last_model.justification() == JUSTIFICATION_LEFT &&
1885  next_model.justification() != JUSTIFICATION_LEFT) ||
1886  (!(*rows)[start].ri_->ltr &&
1887  last_model.justification() == JUSTIFICATION_RIGHT &&
1888  next_model.justification() != JUSTIFICATION_RIGHT)) {
1889  next_consistent = false;
1890  }
1891  last_model = next_model;
1892  } else {
1893  next_consistent = false;
1894  }
1895  } while (next_consistent && end < row_end);
1896  // At this point, rows[start, end) looked like it could have been a
1897  // single paragraph. If we can make a good ParagraphModel for it,
1898  // do so and mark this sequence with that model.
1899  if (end > start + 1) {
1900  // emit a new paragraph if we have more than one line.
1901  const ParagraphModel *model = NULL;
1903  debug_level, rows, start, end,
1904  Epsilon(InterwordSpace(*rows, start, end)));
1905  if (new_model.justification() == JUSTIFICATION_UNKNOWN) {
1906  // couldn't create a good model, oh well.
1907  } else if (new_model.is_flush()) {
1908  if (end == start + 2) {
1909  // It's very likely we just got two paragraph starts in a row.
1910  end = start + 1;
1911  } else if (start == row_start) {
1912  // Mark this as a Crown.
1913  if (new_model.justification() == JUSTIFICATION_LEFT) {
1914  model = kCrownLeft;
1915  } else {
1916  model = kCrownRight;
1917  }
1918  } else if (allow_flush_models) {
1919  model = theory->AddModel(new_model);
1920  }
1921  } else {
1922  model = theory->AddModel(new_model);
1923  }
1924  if (model) {
1925  (*rows)[start].AddStartLine(model);
1926  for (int i = start + 1; i < end; i++) {
1927  (*rows)[i].AddBodyLine(model);
1928  }
1929  }
1930  }
1931  start = end;
1932  }
1933 }
1934 
1935 // We examine rows[row_start, row_end) and do the following:
1936 // (1) Clear all existing hypotheses for the rows being considered.
1937 // (2) Mark up any rows as exceptionally likely to be paragraph starts
1938 // or paragraph body lines as such using both geometric and textual
1939 // clues.
1940 // (3) Form models for any sequence of start + continuation lines.
1941 // (4) Smear the paragraph models to cover surrounding text.
1942 void StrongEvidenceClassify(int debug_level,
1944  int row_start, int row_end,
1945  ParagraphTheory *theory) {
1946  if (!AcceptableRowArgs(debug_level, 2, __func__, rows, row_start, row_end))
1947  return;
1948 
1949  if (debug_level > 1) {
1950  tprintf("#############################################\n");
1951  tprintf("# StrongEvidenceClassify( rows[%d:%d) )\n", row_start, row_end);
1952  tprintf("#############################################\n");
1953  }
1954 
1955  RecomputeMarginsAndClearHypotheses(rows, row_start, row_end, 10);
1956  MarkStrongEvidence(rows, row_start, row_end);
1957 
1958  DebugDump(debug_level > 2, "Initial strong signals.", *theory, *rows);
1959 
1960  // Create paragraph models.
1961  ModelStrongEvidence(debug_level, rows, row_start, row_end, false, theory);
1962 
1963  DebugDump(debug_level > 2, "Unsmeared hypotheses.s.", *theory, *rows);
1964 
1965  // At this point, some rows are marked up as paragraphs with model numbers,
1966  // and some rows are marked up as either LT_START or LT_BODY. Now let's
1967  // smear any good paragraph hypotheses forward and backward.
1968  ParagraphModelSmearer smearer(rows, row_start, row_end, theory);
1969  smearer.Smear();
1970 }
1971 
1973  int row_start, int row_end,
1974  ParagraphTheory *theory) {
1975  for (int i = row_start + 1; i < row_end - 1; i++) {
1976  if ((*rows)[i - 1].ri_->has_leaders &&
1977  (*rows)[i].ri_->has_leaders &&
1978  (*rows)[i + 1].ri_->has_leaders) {
1979  const ParagraphModel *model = theory->AddModel(
1980  ParagraphModel(JUSTIFICATION_UNKNOWN, 0, 0, 0, 0));
1981  (*rows)[i].AddStartLine(model);
1982  }
1983  }
1984 }
1985 
1986 // Collect sequences of unique hypotheses in row registers and create proper
1987 // paragraphs for them, referencing the paragraphs in row_owners.
1989  int debug_level,
1991  GenericVector<PARA *> *row_owners,
1992  ParagraphTheory *theory) {
1993  int end = rows.size();
1994  int start;
1995  for (; end > 0; end = start) {
1996  start = end - 1;
1997  const ParagraphModel *model = NULL;
1998  // TODO(eger): Be smarter about dealing with multiple hypotheses.
1999  bool single_line_paragraph = false;
2000  SetOfModels models;
2001  rows[start].NonNullHypotheses(&models);
2002  if (models.size() > 0) {
2003  model = models[0];
2004  if (rows[start].GetLineType(model) != LT_BODY)
2005  single_line_paragraph = true;
2006  }
2007  if (model && !single_line_paragraph) {
2008  // walk back looking for more body lines and then a start line.
2009  while (--start > 0 && rows[start].GetLineType(model) == LT_BODY) {
2010  // do nothing
2011  }
2012  if (start < 0 || rows[start].GetLineType(model) != LT_START) {
2013  model = NULL;
2014  }
2015  }
2016  if (model == NULL) {
2017  continue;
2018  }
2019  // rows[start, end) should be a paragraph.
2020  PARA *p = new PARA();
2021  if (model == kCrownLeft || model == kCrownRight) {
2023  // Crown paragraph.
2024  // If we can find an existing ParagraphModel that fits, use it,
2025  // else create a new one.
2026  for (int row = end; row < rows.size(); row++) {
2027  if ((*row_owners)[row] &&
2028  (ValidBodyLine(&rows, start, (*row_owners)[row]->model) &&
2029  (start == 0 ||
2030  ValidFirstLine(&rows, start, (*row_owners)[row]->model)))) {
2031  model = (*row_owners)[row]->model;
2032  break;
2033  }
2034  }
2035  if (model == kCrownLeft) {
2036  // No subsequent model fits, so cons one up.
2037  model = theory->AddModel(ParagraphModel(
2038  JUSTIFICATION_LEFT, rows[start].lmargin_ + rows[start].lindent_,
2039  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2040  } else if (model == kCrownRight) {
2041  // No subsequent model fits, so cons one up.
2042  model = theory->AddModel(ParagraphModel(
2043  JUSTIFICATION_RIGHT, rows[start].rmargin_ + rows[start].rmargin_,
2044  0, 0, Epsilon(rows[start].ri_->average_interword_space)));
2045  }
2046  }
2047  rows[start].SetUnknown();
2048  rows[start].AddStartLine(model);
2049  for (int i = start + 1; i < end; i++) {
2050  rows[i].SetUnknown();
2051  rows[i].AddBodyLine(model);
2052  }
2053  p->model = model;
2054  p->has_drop_cap = rows[start].ri_->has_drop_cap;
2055  p->is_list_item =
2057  ? rows[start].ri_->rword_indicates_list_item
2058  : rows[start].ri_->lword_indicates_list_item;
2059  for (int row = start; row < end; row++) {
2060  if ((*row_owners)[row] != NULL) {
2061  tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called "
2062  "more than once!\n");
2063  }
2064  (*row_owners)[row] = p;
2065  }
2066  }
2067 }
2068 
// A contiguous range of row indices [begin, end]; LeftoverSegments()
// builds these inclusively and then converts them to half-open form.
struct Interval {
  int begin;
  int end;

  Interval() : begin(0), end(0) {}
  Interval(int b, int e) : begin(b), end(e) {}
};
2076 
2077 // Return whether rows[row] appears to be stranded, meaning that the evidence
2078 // for this row is very weak due to context. For instance, two lines of source
2079 // code may happen to be indented at the same tab vector as body text starts,
2080 // leading us to think they are two start-of-paragraph lines. This is not
2081 // optimal. However, we also don't want to mark a sequence of short dialog
2082 // as "weak," so our heuristic is:
2083 // (1) If a line is surrounded by lines of unknown type, it's weak.
2084 // (2) If two lines in a row are start lines for a given paragraph type, but
2085 // after that the same paragraph type does not continue, they're weak.
2087  SetOfModels row_models;
2088  rows[row].StrongHypotheses(&row_models);
2089 
2090  for (int m = 0; m < row_models.size(); m++) {
2091  bool all_starts = rows[row].GetLineType();
2092  int run_length = 1;
2093  bool continues = true;
2094  for (int i = row - 1; i >= 0 && continues; i--) {
2095  SetOfModels models;
2096  rows[i].NonNullHypotheses(&models);
2097  switch (rows[i].GetLineType(row_models[m])) {
2098  case LT_START: run_length++; break;
2099  case LT_MULTIPLE: // explicit fall-through
2100  case LT_BODY: run_length++; all_starts = false; break;
2101  case LT_UNKNOWN: // explicit fall-through
2102  default: continues = false;
2103  }
2104  }
2105  continues = true;
2106  for (int i = row + 1; i < rows.size() && continues; i++) {
2107  SetOfModels models;
2108  rows[i].NonNullHypotheses(&models);
2109  switch (rows[i].GetLineType(row_models[m])) {
2110  case LT_START: run_length++; break;
2111  case LT_MULTIPLE: // explicit fall-through
2112  case LT_BODY: run_length++; all_starts = false; break;
2113  case LT_UNKNOWN: // explicit fall-through
2114  default: continues = false;
2115  }
2116  }
2117  if (run_length > 2 || (!all_starts && run_length > 1)) return false;
2118  }
2119  return true;
2120 }
2121 
2122 // Go through rows[row_start, row_end) and gather up sequences that need better
2123 // classification.
2124 // + Sequences of non-empty rows without hypotheses.
2125 // + Crown paragraphs not immediately followed by a strongly modeled line.
2126 // + Single line paragraphs surrounded by text that doesn't match the
2127 // model.
2129  GenericVector<Interval> *to_fix,
2130  int row_start, int row_end) {
2131  to_fix->clear();
2132  for (int i = row_start; i < row_end; i++) {
2133  bool needs_fixing = false;
2134 
2135  SetOfModels models;
2136  SetOfModels models_w_crowns;
2137  rows[i].StrongHypotheses(&models);
2138  rows[i].NonNullHypotheses(&models_w_crowns);
2139  if (models.empty() && models_w_crowns.size() > 0) {
2140  // Crown paragraph. Is it followed by a modeled line?
2141  for (int end = i + 1; end < rows.size(); end++) {
2142  SetOfModels end_models;
2143  SetOfModels strong_end_models;
2144  rows[end].NonNullHypotheses(&end_models);
2145  rows[end].StrongHypotheses(&strong_end_models);
2146  if (end_models.size() == 0) {
2147  needs_fixing = true;
2148  break;
2149  } else if (strong_end_models.size() > 0) {
2150  needs_fixing = false;
2151  break;
2152  }
2153  }
2154  } else if (models.empty() && rows[i].ri_->num_words > 0) {
2155  // No models at all.
2156  needs_fixing = true;
2157  }
2158 
2159  if (!needs_fixing && !models.empty()) {
2160  needs_fixing = RowIsStranded(rows, i);
2161  }
2162 
2163  if (needs_fixing) {
2164  if (!to_fix->empty() && to_fix->back().end == i - 1)
2165  to_fix->back().end = i;
2166  else
2167  to_fix->push_back(Interval(i, i));
2168  }
2169  }
2170  // Convert inclusive intervals to half-open intervals.
2171  for (int i = 0; i < to_fix->size(); i++) {
2172  (*to_fix)[i].end = (*to_fix)[i].end + 1;
2173  }
2174 }
2175 
2176 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
2177 // normalize each row_owner to point to an actual PARA, and output the
2178 // paragraphs in order onto paragraphs.
2180  GenericVector<PARA *> *row_owners,
2181  PARA_LIST *paragraphs) {
2182  GenericVector<PARA *> &rows = *row_owners;
2183  paragraphs->clear();
2184  PARA_IT out(paragraphs);
2185  PARA *formerly_null = NULL;
2186  for (int i = 0; i < rows.size(); i++) {
2187  if (rows[i] == NULL) {
2188  if (i == 0 || rows[i - 1] != formerly_null) {
2189  rows[i] = formerly_null = new PARA();
2190  } else {
2191  rows[i] = formerly_null;
2192  continue;
2193  }
2194  } else if (i > 0 && rows[i - 1] == rows[i]) {
2195  continue;
2196  }
2197  out.add_after_then_move(rows[i]);
2198  }
2199 }
2200 
2201 // Main entry point for Paragraph Detection Algorithm.
2202 //
2203 // Given a set of equally spaced textlines (described by row_infos),
2204 // Split them into paragraphs.
2205 //
2206 // Output:
2207 // row_owners - one pointer for each row, to the paragraph it belongs to.
2208 // paragraphs - this is the actual list of PARA objects.
2209 // models - the list of paragraph models referenced by the PARA objects.
2210 // caller is responsible for deleting the models.
2211 void DetectParagraphs(int debug_level,
2212  GenericVector<RowInfo> *row_infos,
2213  GenericVector<PARA *> *row_owners,
2214  PARA_LIST *paragraphs,
2217  ParagraphTheory theory(models);
2218 
2219  // Initialize row_owners to be a bunch of NULL pointers.
2220  row_owners->init_to_size(row_infos->size(), NULL);
2221 
2222  // Set up row scratch registers for the main algorithm.
2223  rows.init_to_size(row_infos->size(), RowScratchRegisters());
2224  for (int i = 0; i < row_infos->size(); i++) {
2225  rows[i].Init((*row_infos)[i]);
2226  }
2227 
2228  // Pass 1:
2229  // Detect sequences of lines that all contain leader dots (.....)
2230  // These are likely Tables of Contents. If there are three text lines in
2231  // a row with leader dots, it's pretty safe to say the middle one should
2232  // be a paragraph of its own.
2233  SeparateSimpleLeaderLines(&rows, 0, rows.size(), &theory);
2234 
2235  DebugDump(debug_level > 1, "End of Pass 1", theory, rows);
2236 
2237  GenericVector<Interval> leftovers;
2238  LeftoverSegments(rows, &leftovers, 0, rows.size());
2239  for (int i = 0; i < leftovers.size(); i++) {
2240  // Pass 2a:
2241  // Find any strongly evidenced start-of-paragraph lines. If they're
2242  // followed by two lines that look like body lines, make a paragraph
2243  // model for that and see if that model applies throughout the text
2244  // (that is, "smear" it).
2245  StrongEvidenceClassify(debug_level, &rows,
2246  leftovers[i].begin, leftovers[i].end, &theory);
2247 
2248  // Pass 2b:
2249  // If we had any luck in pass 2a, we got part of the page and didn't
2250  // know how to classify a few runs of rows. Take the segments that
2251  // didn't find a model and reprocess them individually.
2252  GenericVector<Interval> leftovers2;
2253  LeftoverSegments(rows, &leftovers2, leftovers[i].begin, leftovers[i].end);
2254  bool pass2a_was_useful = leftovers2.size() > 1 ||
2255  (leftovers2.size() == 1 &&
2256  (leftovers2[0].begin != 0 || leftovers2[0].end != rows.size()));
2257  if (pass2a_was_useful) {
2258  for (int j = 0; j < leftovers2.size(); j++) {
2259  StrongEvidenceClassify(debug_level, &rows,
2260  leftovers2[j].begin, leftovers2[j].end,
2261  &theory);
2262  }
2263  }
2264  }
2265 
2266  DebugDump(debug_level > 1, "End of Pass 2", theory, rows);
2267 
2268  // Pass 3:
2269  // These are the dregs for which we didn't have enough strong textual
2270  // and geometric clues to form matching models for. Let's see if
2271  // the geometric clues are simple enough that we could just use those.
2272  LeftoverSegments(rows, &leftovers, 0, rows.size());
2273  for (int i = 0; i < leftovers.size(); i++) {
2274  GeometricClassify(debug_level, &rows,
2275  leftovers[i].begin, leftovers[i].end, &theory);
2276  }
2277  // Undo any flush models for which there's little evidence.
2278  DowngradeWeakestToCrowns(debug_level, &theory, &rows);
2279 
2280  DebugDump(debug_level > 1, "End of Pass 3", theory, rows);
2281 
2282  // Pass 4:
2283  // Take everything that's still not marked up well and clear all markings.
2284  LeftoverSegments(rows, &leftovers, 0, rows.size());
2285  for (int i = 0; i < leftovers.size(); i++) {
2286  for (int j = leftovers[i].begin; j < leftovers[i].end; j++) {
2287  rows[j].SetUnknown();
2288  }
2289  }
2290 
2291  DebugDump(debug_level > 1, "End of Pass 4", theory, rows);
2292 
2293  // Convert all of the unique hypothesis runs to PARAs.
2294  ConvertHypothesizedModelRunsToParagraphs(debug_level, rows, row_owners,
2295  &theory);
2296 
2297  DebugDump(debug_level > 0, "Final Paragraph Segmentation", theory, rows);
2298 
2299  // Finally, clean up any dangling NULL row paragraph parents.
2300  CanonicalizeDetectionResults(row_owners, paragraphs);
2301 }
2302 
2303 // ============ Code interfacing with the rest of Tesseract ==================
2304 
2306  RowInfo *info) {
2307  // Set up text, lword_text, and rword_text (mostly for debug printing).
2308  STRING fake_text;
2309  PageIterator pit(static_cast<const PageIterator&>(it));
2310  bool first_word = true;
2311  if (!pit.Empty(RIL_WORD)) {
2312  do {
2313  fake_text += "x";
2314  if (first_word) info->lword_text += "x";
2315  info->rword_text += "x";
2316  if (pit.IsAtFinalElement(RIL_WORD, RIL_SYMBOL) &&
2318  fake_text += " ";
2319  info->rword_text = "";
2320  first_word = false;
2321  }
2322  } while (!pit.IsAtFinalElement(RIL_TEXTLINE, RIL_SYMBOL) &&
2323  pit.Next(RIL_SYMBOL));
2324  }
2325  if (fake_text.size() == 0) return;
2326 
2327  int lspaces = info->pix_ldistance / info->average_interword_space;
2328  for (int i = 0; i < lspaces; i++) {
2329  info->text += ' ';
2330  }
2331  info->text += fake_text;
2332 
2333  // Set up lword_box, rword_box, and num_words.
2334  PAGE_RES_IT page_res_it = *it.PageResIt();
2335  WERD_RES *word_res = page_res_it.restart_row();
2336  ROW_RES *this_row = page_res_it.row();
2337 
2338  WERD_RES *lword = NULL;
2339  WERD_RES *rword = NULL;
2340  info->num_words = 0;
2341  do {
2342  if (word_res) {
2343  if (!lword) lword = word_res;
2344  if (rword != word_res) info->num_words++;
2345  rword = word_res;
2346  }
2347  word_res = page_res_it.forward();
2348  } while (page_res_it.row() == this_row);
2349  info->lword_box = lword->word->bounding_box();
2350  info->rword_box = rword->word->bounding_box();
2351 }
2352 
2353 
2354 // Given a Tesseract Iterator pointing to a text line, fill in the paragraph
2355 // detector RowInfo with all relevant information from the row.
2356 void InitializeRowInfo(bool after_recognition,
2357  const MutableIterator &it,
2358  RowInfo *info) {
2359  if (it.PageResIt()->row() != NULL) {
2360  ROW *row = it.PageResIt()->row()->row;
2361  info->pix_ldistance = row->lmargin();
2362  info->pix_rdistance = row->rmargin();
2363  info->average_interword_space =
2364  row->space() > 0 ? row->space() : MAX(row->x_height(), 1);
2365  info->pix_xheight = row->x_height();
2366  info->has_leaders = false;
2367  info->has_drop_cap = row->has_drop_cap();
2368  info->ltr = true; // set below depending on word scripts
2369  } else {
2370  info->pix_ldistance = info->pix_rdistance = 0;
2371  info->average_interword_space = 1;
2372  info->pix_xheight = 1.0;
2373  info->has_leaders = false;
2374  info->has_drop_cap = false;
2375  info->ltr = true;
2376  }
2377 
2378  info->num_words = 0;
2379  info->lword_indicates_list_item = false;
2380  info->lword_likely_starts_idea = false;
2381  info->lword_likely_ends_idea = false;
2382  info->rword_indicates_list_item = false;
2383  info->rword_likely_starts_idea = false;
2384  info->rword_likely_ends_idea = false;
2385  info->has_leaders = false;
2386  info->ltr = 1;
2387 
2388  if (!after_recognition) {
2390  return;
2391  }
2392  info->text = "";
2393  char *text = it.GetUTF8Text(RIL_TEXTLINE);
2394  int trailing_ws_idx = strlen(text); // strip trailing space
2395  while (trailing_ws_idx > 0 &&
2396  // isspace() only takes ASCII
2397  ((text[trailing_ws_idx - 1] & 0x80) == 0) &&
2398  isspace(text[trailing_ws_idx - 1]))
2399  trailing_ws_idx--;
2400  if (trailing_ws_idx > 0) {
2401  int lspaces = info->pix_ldistance / info->average_interword_space;
2402  for (int i = 0; i < lspaces; i++)
2403  info->text += ' ';
2404  for (int i = 0; i < trailing_ws_idx; i++)
2405  info->text += text[i];
2406  }
2407  delete []text;
2408 
2409  if (info->text.size() == 0) {
2410  return;
2411  }
2412 
2413  PAGE_RES_IT page_res_it = *it.PageResIt();
2415  WERD_RES *word_res = page_res_it.restart_row();
2416  ROW_RES *this_row = page_res_it.row();
2417  int num_leaders = 0;
2418  int ltr = 0;
2419  int rtl = 0;
2420  do {
2421  if (word_res && word_res->best_choice->unichar_string().length() > 0) {
2422  werds.push_back(word_res);
2423  ltr += word_res->AnyLtrCharsInWord() ? 1 : 0;
2424  rtl += word_res->AnyRtlCharsInWord() ? 1 : 0;
2425  if (word_res->word->flag(W_REP_CHAR)) num_leaders++;
2426  }
2427  word_res = page_res_it.forward();
2428  } while (page_res_it.row() == this_row);
2429  info->ltr = ltr >= rtl;
2430  info->has_leaders = num_leaders > 3;
2431  info->num_words = werds.size();
2432  if (werds.size() > 0) {
2433  WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1];
2434  info->lword_text = lword->best_choice->unichar_string().string();
2435  info->rword_text = rword->best_choice->unichar_string().string();
2436  info->lword_box = lword->word->bounding_box();
2437  info->rword_box = rword->word->bounding_box();
2438  LeftWordAttributes(lword->uch_set, lword->best_choice,
2439  info->lword_text,
2441  &info->lword_likely_starts_idea,
2442  &info->lword_likely_ends_idea);
2443  RightWordAttributes(rword->uch_set, rword->best_choice,
2444  info->rword_text,
2446  &info->rword_likely_starts_idea,
2447  &info->rword_likely_ends_idea);
2448  }
2449 }
2450 
2451 // This is called after rows have been identified and words are recognized.
2452 // Much of this could be implemented before word recognition, but text helps
2453 // to identify bulleted lists and gives good signals for sentence boundaries.
2454 void DetectParagraphs(int debug_level,
2455  bool after_text_recognition,
2456  const MutableIterator *block_start,
2458  // Clear out any preconceived notions.
2459  if (block_start->Empty(RIL_TEXTLINE)) {
2460  return;
2461  }
2462  BLOCK *block = block_start->PageResIt()->block()->block;
2463  block->para_list()->clear();
2464  bool is_image_block = block->poly_block() && !block->poly_block()->IsText();
2465 
2466  // Convert the Tesseract structures to RowInfos
2467  // for the paragraph detection algorithm.
2468  MutableIterator row(*block_start);
2469  if (row.Empty(RIL_TEXTLINE))
2470  return; // end of input already.
2471 
2472  GenericVector<RowInfo> row_infos;
2473  do {
2474  if (!row.PageResIt()->row())
2475  continue; // empty row.
2476  row.PageResIt()->row()->row->set_para(NULL);
2477  row_infos.push_back(RowInfo());
2478  RowInfo &ri = row_infos.back();
2479  InitializeRowInfo(after_text_recognition, row, &ri);
2480  } while (!row.IsAtFinalElement(RIL_BLOCK, RIL_TEXTLINE) &&
2481  row.Next(RIL_TEXTLINE));
2482 
2483  // If we're called before text recognition, we might not have
2484  // tight block bounding boxes, so trim by the minimum on each side.
2485  if (row_infos.size() > 0) {
2486  int min_lmargin = row_infos[0].pix_ldistance;
2487  int min_rmargin = row_infos[0].pix_rdistance;
2488  for (int i = 1; i < row_infos.size(); i++) {
2489  if (row_infos[i].pix_ldistance < min_lmargin)
2490  min_lmargin = row_infos[i].pix_ldistance;
2491  if (row_infos[i].pix_rdistance < min_rmargin)
2492  min_rmargin = row_infos[i].pix_rdistance;
2493  }
2494  if (min_lmargin > 0 || min_rmargin > 0) {
2495  for (int i = 0; i < row_infos.size(); i++) {
2496  row_infos[i].pix_ldistance -= min_lmargin;
2497  row_infos[i].pix_rdistance -= min_rmargin;
2498  }
2499  }
2500  }
2501 
2502  // Run the paragraph detection algorithm.
2503  GenericVector<PARA *> row_owners;
2504  GenericVector<PARA *> the_paragraphs;
2505  if (!is_image_block) {
2506  DetectParagraphs(debug_level, &row_infos, &row_owners, block->para_list(),
2507  models);
2508  } else {
2509  row_owners.init_to_size(row_infos.size(), NULL);
2510  CanonicalizeDetectionResults(&row_owners, block->para_list());
2511  }
2512 
2513  // Now stitch in the row_owners into the rows.
2514  row = *block_start;
2515  for (int i = 0; i < row_owners.size(); i++) {
2516  while (!row.PageResIt()->row())
2517  row.Next(RIL_TEXTLINE);
2518  row.PageResIt()->row()->row->set_para(row_owners[i]);
2519  row.Next(RIL_TEXTLINE);
2520  }
2521 }
2522 
2523 } // namespace