Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
permute.h
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permute.h (Formerly permute.h)
5  * Description: Permute choices together
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Sep 22 14:05:51 1989
8  * Modified: Mon May 20 16:32:04 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Experimental (Do Not Distribute)
12  *
13  * (c) Copyright 1989, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  ********************************************************************************/
25 #ifndef PERMUTE_H
26 #define PERMUTE_H
27 
28 /*----------------------------------------------------------------------
29  I n c l u d e s
30 ----------------------------------------------------------------------*/
31 
32 #include "ratngs.h"
33 #include "params.h"
34 #include "unicharset.h"
35 
36 #define MAX_PERM_LENGTH 128  // size of PermuterState::perm_state_; holds up to MAX_PERM_LENGTH-1 states
37 
38 /*----------------------------------------------------------------------
39  V a r i a b l e s
40 ----------------------------------------------------------------------*/
41 extern INT_VAR_H(fragments_debug, 0, "Debug character fragments");
42 extern INT_VAR_H(segment_debug, 0, "Debug the whole segmentation process");
43 extern BOOL_VAR_H(permute_debug, 0, "char permutation debug");
44 
45 extern BOOL_VAR_H(permute_script_word, 0,
46  "Turn on word script consistency permuter");
47 
48 extern BOOL_VAR_H(permute_fixed_length_dawg, 0,
49  "Turn on fixed-length phrasebook search permuter");
50 
51 extern BOOL_VAR_H(segment_segcost_rating, 0,
52  "incorporate segmentation cost in word rating?");
53 
54 extern double_VAR_H(segment_reward_script, 0.95,
55  "Score multiplier for script consistency within a word. "
56  "Being a 'reward' factor, it should be <= 1. "
57  "Smaller value implies bigger reward.");
58 
59 extern BOOL_VAR_H(permute_chartype_word, 0,
60  "Turn on character type (property) consistency permuter");
61 extern double_VAR_H(segment_reward_chartype, 0.97,
62  "Score multiplier for char type consistency within a word. ");
63 
64 extern double_VAR_H(segment_reward_ngram_best_choice, 0.99,
65  "Score multiplier for ngram permuter's best choice"
66  " (only used in the Han script path).");
67 
68 extern INT_VAR_H(max_permuter_attempts, 100000,
69  "Maximum number of different character choices to consider"
70  " during permutation. This limit is especially useful when"
71  " user patterns are specified, since overly generic patterns"
72  " can result in dawg search exploring an overly large number"
73  " of options.");  // leading space keeps the concatenated text from reading "numberof"
74 
75 extern int permute_only_top;
76 
77 /*----------------------------------------------------------------------
78  F u n c t i o n s
79 ----------------------------------------------------------------------*/
80 void adjust_non_word(const char *word, const char *word_lengths,
81  float rating, float *new_rating, float *adjust_factor);  // NOTE(review): new_rating and adjust_factor are non-const pointers, presumably outputs - confirm against the definition
82 
83 const char* choose_il1(const char *first_char, // first choice
84  const char *second_char, // second choice
85  const char *third_char, // third choice
86  const char *prev_char, // previous char in word
87  const char *next_char, // next char in word
88  const char *next_next_char); // presumably the char after next_char in word - TODO confirm
89 
90 namespace tesseract {
91 
92 // This is an awkward solution to allow "compounding" of permuter effects.
93 // Right now, each permuter generates a WERD_CHOICE with some modified
94 // rating which is compared to the current best choice, and the winner
95 // is saved. Therefore, independent permuter improvements, eg. from script
96 // consistency, dictionary check, and punctuation promoting, override each
97 // other and can not be combined.
98 // We need a trellis and someway to modify the path cost. Instead, we
99 // approximate by saving a permutation string, which records the preferred
100 // char choice [0-9] at each position [0..#chunks], and a cumulative reward
101 // factor. Non-conflicting changes can be accumulated and the combined
102 // result will be returned.
103 // Default_bias is the initial value for the base multiplier. In other words,
104 // it is the multiplier for raw choice rating if nothing is modified.
105 // This would be 1.0 when used with reward-based permuters in CJK-path,
106 // but it could be > 1 (eg. segment_penalty_garbage) to be compatible with
107 // penalty-based permuters in the Latin path.
108 // Note this class does not handle fragmented characters. It does so by
109 // setting the preferred position of fragmented characters to '1' at Init,
110 // which effectively skips the fragment choice. However, it can still be
111 // overridden if collision is allowed. It is the responsibility of the
112 // permuters to avoid permuting fragmented characters.
114  public:
115  PermuterState();
116 
117  void Init(const BLOB_CHOICE_LIST_VECTOR& char_choices,
118  const UNICHARSET &unicharset,
119  float default_bias,
120  bool debug);
121 
122  void AddPreference(int start_pos, char* pos_str, float weight);
123 
124  void AddPreference(int char_pos, BLOB_CHOICE* blob_choice, float weight);
125 
126  WERD_CHOICE* GetPermutedWord(float *certainties, float *adjust_factor);
127 
128  void set_allow_collision(bool flag) { allow_collision_ = flag; }
129  void set_adjust_factor(float factor) { adjust_factor_ = factor; }
130  void set_debug(bool debug) { debug_ = debug; }
131  bool position_marked(int pos) { return perm_state_[pos] != kPosFree; }
132 
133  private:
134  static const char kPosFree = '.';
135 
136  const UNICHARSET *unicharset_;
137 
138  const BLOB_CHOICE_LIST_VECTOR *char_choices_; // reference pointer only
139  // does not need to be allocated or freed
140  char perm_state_[MAX_PERM_LENGTH]; // handles upto MAX_PERM_LENGTH-1 states
141  // stores preferred char choices, '0'..'9', or '.'
142  int word_length_; // the number of char positions in the word
143  bool allow_collision_; // can previously set preference to be overwritten?
144  float adjust_factor_; // multiplying factor for rating adjustment
145  bool debug_; // whether debug statements should be printed
146 };
147 
148 } // namespace tesseract
149 
150 #endif