Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
trie.h
Go to the documentation of this file.
1
/* -*-C-*-
2
********************************************************************************
3
*
4
* File: trie.h (Formerly trie.h)
5
* Description: Functions to build a trie data structure.
6
* Author: Mark Seaman, SW Productivity
7
* Created: Fri Oct 16 14:37:00 1987
8
* Modified: Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt
9
* Language: C
10
* Package: N/A
11
* Status: Reusable Software Component
12
*
13
* (c) Copyright 1987, Hewlett-Packard Company.
14
** Licensed under the Apache License, Version 2.0 (the "License");
15
** you may not use this file except in compliance with the License.
16
** You may obtain a copy of the License at
17
** http://www.apache.org/licenses/LICENSE-2.0
18
** Unless required by applicable law or agreed to in writing, software
19
** distributed under the License is distributed on an "AS IS" BASIS,
20
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21
** See the License for the specific language governing permissions and
22
** limitations under the License.
23
*
24
*********************************************************************************/
25
#ifndef TRIE_H
26
#define TRIE_H
27
28
#include "
dawg.h
"
29
#include "
cutil.h
"
30
#include "
genericvector.h
"
31
32
class
UNICHARSET
;
33
34
// Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
35
// max int32, we will need to change GenericVector to use int64 for size
36
// and address indices. This does not seem to be needed immediately,
37
// since currently the largest number of edges limit used by tesseract
38
// (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
39
// There are also int casts below to satisfy the WIN32 compiler that would
40
// need to be changed.
41
// It might be cleanest to change the types of most of the Trie/Dawg related
42
// typedefs to int and restrict the casts to extracting these values from
43
// the 64 bit EDGE_RECORD.
44
typedef
inT64
EDGE_INDEX
;
// index of an edge in a given node
45
typedef
bool
*
NODE_MARKER
;
46
typedef
GenericVector<EDGE_RECORD>
EDGE_VECTOR
;
47
48
struct
TRIE_NODE_RECORD
{
49
EDGE_VECTOR
forward_edges
;
50
EDGE_VECTOR
backward_edges
;
51
};
52
typedef
GenericVector<TRIE_NODE_RECORD *>
TRIE_NODES
;
53
54
namespace
tesseract
{
55
62
class
Trie
:
public
Dawg
{
63
public
:
64
enum
RTLReversePolicy
{
65
RRP_DO_NO_REVERSE
,
66
RRP_REVERSE_IF_HAS_RTL
,
67
RRP_FORCE_REVERSE
,
68
};
69
70
// Minimum number of concrete characters at the beginning of user patterns.
71
static
const
int
kSaneNumConcreteChars
= 4;
72
// Various unicode whitespace characters are used to denote unichar patterns,
73
// (character classifier would never produce these whitespace characters as a
74
// valid classification).
75
static
const
char
kAlphaPatternUnicode
[];
76
static
const
char
kDigitPatternUnicode
[];
77
static
const
char
kAlphanumPatternUnicode
[];
78
static
const
char
kPuncPatternUnicode
[];
79
static
const
char
kLowerPatternUnicode
[];
80
static
const
char
kUpperPatternUnicode
[];
81
82
static
const
char
*
get_reverse_policy_name
(
83
RTLReversePolicy
reverse_policy);
84
85
// max_num_edges argument allows limiting the amount of memory this
86
// Trie can consume (if a new word insert would cause the Trie to
87
// contain more edges than max_num_edges, all the edges are cleared
88
// so that new inserts can proceed).
89
Trie
(
DawgType
type
,
const
STRING
&
lang
, PermuterType perm,
90
uinT64
max_num_edges,
int
unicharset_size,
int
debug_level) {
91
init
(type, lang, perm, unicharset_size, debug_level);
92
num_edges_
= 0;
93
max_num_edges_
= max_num_edges;
94
deref_node_index_mask_
= ~
letter_mask_
;
95
new_dawg_node
();
// need to allocate node 0
96
initialized_patterns_
=
false
;
97
}
98
virtual
~Trie
() {
nodes_
.
delete_data_pointers
(); }
99
100
// Reset the Trie to empty.
101
void
clear
();
102
104
EDGE_REF
edge_char_of
(
NODE_REF
node_ref,
UNICHAR_ID
unichar_id,
105
bool
word_end)
const
{
106
EDGE_RECORD
*edge_ptr;
107
EDGE_INDEX
edge_index;
108
if
(!
edge_char_of
(node_ref, NO_EDGE,
FORWARD_EDGE
, word_end, unichar_id,
109
&edge_ptr, &edge_index))
return
NO_EDGE;
110
return
make_edge_ref
(node_ref, edge_index);
111
}
112
117
void
unichar_ids_of
(
NODE_REF
node,
NodeChildVector
*vec)
const
{
118
const
EDGE_VECTOR
&forward_edges =
119
nodes_
[
static_cast<
int
>
(node)]->forward_edges;
120
for
(
int
i = 0; i < forward_edges.
size
(); ++i) {
121
vec->
push_back
(
NodeChild
(
unichar_id_from_edge_rec
(forward_edges[i]),
122
make_edge_ref
(node, i)));
123
}
124
}
125
130
NODE_REF
next_node
(
EDGE_REF
edge_ref)
const
{
131
if
(edge_ref == NO_EDGE ||
num_edges_
== 0)
return
NO_EDGE;
132
return
next_node_from_edge_rec
(*
deref_edge_ref
(edge_ref));
133
}
134
139
bool
end_of_word
(
EDGE_REF
edge_ref)
const
{
140
if
(edge_ref == NO_EDGE ||
num_edges_
== 0)
return
false
;
141
return
end_of_word_from_edge_rec
(*
deref_edge_ref
(edge_ref));
142
}
143
145
UNICHAR_ID
edge_letter
(
EDGE_REF
edge_ref)
const
{
146
if
(edge_ref == NO_EDGE ||
num_edges_
== 0)
return
INVALID_UNICHAR_ID;
147
return
unichar_id_from_edge_rec
(*
deref_edge_ref
(edge_ref));
148
}
149
150
// Prints the contents of the node indicated by the given NODE_REF.
151
// At most max_num_edges will be printed.
152
void
print_node
(
NODE_REF
node,
int
max_num_edges)
const
;
153
154
// Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
155
// Eliminates redundant edges and returns the pointer to the SquishedDawg.
156
// Note: the caller is responsible for deallocating memory associated
157
// with the returned SquishedDawg pointer.
158
SquishedDawg
*
trie_to_dawg
();
159
160
// Inserts the list of words from the given file into the Trie.
161
// If reverse is true, calls WERD_CHOICE::reverse_unichar_ids_if_rtl()
162
// on each word before inserting it into the Trie.
163
bool
read_word_list
(
const
char
*
filename
,
164
const
UNICHARSET
&unicharset,
165
Trie::RTLReversePolicy
reverse
);
166
167
// Inserts the list of patterns from the given file into the Trie.
168
// The pattern list file should contain one pattern per line in UTF-8 format.
169
//
170
// Each pattern can contain any non-whitespace characters, however only the
171
// patterns that contain characters from the unicharset of the corresponding
172
// language will be useful.
173
// The only meta character is '\'. To be used in a pattern as an ordinary
174
// string it should be escaped with '\' (e.g. string "C:\Documents" should
175
// be written in the patterns file as "C:\\Documents").
176
// This function supports a very limited regular expression syntax. One can
177
// express a character, a certain character class and a number of times the
178
// entity should be repeated in the pattern.
179
//
180
// To denote a character class use one of:
181
// \c - unichar for which UNICHARSET::get_isalpha() is true (character)
182
// \d - unichar for which UNICHARSET::get_isdigit() is true
183
// \n - unichar for which UNICHARSET::get_isdigit() and
184
// UNICHARSET::isalpha() are true
185
// \p - unichar for which UNICHARSET::get_ispunct() is true
186
// \a - unichar for which UNICHARSET::get_islower() is true
187
// \A - unichar for which UNICHARSET::get_isupper() is true
188
//
189
// \* could be specified after each character or pattern to indicate that
190
// the character/pattern can be repeated any number of times before the next
191
// character/pattern occurs.
192
//
193
// Examples:
194
// 1-8\d\d-GOOG-411 will be expanded to strings:
195
// 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
196
//
197
// http://www.\n\*.com will be expanded to strings like:
198
// http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
199
//
200
// Note: In choosing which patterns to include please be aware of the fact
201
// providing very generic patterns will make tesseract run slower.
202
// For example \n\* at the beginning of the pattern will make Tesseract
203
// consider all the combinations of proposed character choices for each
204
// of the segmentations, which will be unacceptably slow.
205
// Because of potential problems with speed that could be difficult to
206
// identify, each user pattern has to have at least kSaneNumConcreteChars
207
// concrete characters from the unicharset at the beginning.
208
bool
read_pattern_list
(
const
char
*
filename
,
const
UNICHARSET
&unicharset);
209
210
// Initializes the values of *_pattern_ unichar ids.
211
// This function should be called before calling read_pattern_list().
212
void
initialize_patterns
(
UNICHARSET
*unicharset);
213
214
// Fills in the given unichar id vector with the unichar ids that represent
215
// the patterns of the character classes of the given unichar_id.
216
void
unichar_id_to_patterns
(
UNICHAR_ID
unichar_id,
217
const
UNICHARSET
&unicharset,
218
GenericVector<UNICHAR_ID>
*vec)
const
;
219
220
// Returns the given EDGE_REF if the EDGE_RECORD that it points to has
221
// a self loop and the given unichar_id matches the unichar_id stored in the
222
// EDGE_RECORD, returns NO_EDGE otherwise.
223
virtual
EDGE_REF
pattern_loop_edge
(
EDGE_REF
edge_ref,
224
UNICHAR_ID
unichar_id,
225
bool
word_end)
const
{
226
if
(edge_ref == NO_EDGE)
return
NO_EDGE;
227
EDGE_RECORD
*edge_rec =
deref_edge_ref
(edge_ref);
228
return
(
marker_flag_from_edge_rec
(*edge_rec) &&
229
unichar_id ==
unichar_id_from_edge_rec
(*edge_rec) &&
230
word_end ==
end_of_word_from_edge_rec
(*edge_rec)) ?
231
edge_ref : NO_EDGE;
232
}
233
234
// Adds a word to the Trie (creates the necessary nodes and edges).
235
//
236
// If repetitions vector is not NULL, each entry in the vector indicates
237
// whether the unichar id with the corresponding index in the word is allowed
238
// to repeat an unlimited number of times. For each entry that is true, MARKER
239
// flag of the corresponding edge created for this unichar id is set to true).
240
//
241
// Return true if add succeeded, false otherwise (e.g. when a word contained
242
// an invalid unichar id or the trie was getting too large and was cleared).
243
bool
add_word_to_dawg
(
const
WERD_CHOICE
&word,
244
const
GenericVector<bool>
*repetitions);
245
bool
add_word_to_dawg
(
const
WERD_CHOICE
&word) {
246
return
add_word_to_dawg
(word,
NULL
);
247
}
248
249
protected
:
250
// The structure of an EDGE_REF for Trie edges is as follows:
251
// [LETTER_START_BIT, flag_start_bit_):
252
// edge index in *_edges in a TRIE_NODE_RECORD
253
// [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
254
//
255
// With this arrangement there are enough bits to represent edge indices
256
// (each node can have at most unicharset_size_ forward edges and
257
// the position of flag_start_bit is set to be log2(unicharset_size_)).
258
// It is also possible to accommodate a maximum number of nodes that is at
259
// least as large as that of the SquishedDawg representation (in SquishedDawg
260
// each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
261
// the next node index).
262
//
263
264
// Returns the pointer to EDGE_RECORD after decoding the location
265
// of the edge from the information in the given EDGE_REF.
266
// This function assumes that EDGE_REF holds valid node/edge indices.
267
inline
EDGE_RECORD
*
deref_edge_ref
(
EDGE_REF
edge_ref)
const
{
268
int
edge_index =
static_cast<
int
>
(
269
(edge_ref &
letter_mask_
) >>
LETTER_START_BIT
);
270
int
node_index =
static_cast<
int
>
(
271
(edge_ref &
deref_node_index_mask_
) >>
flag_start_bit_
);
272
TRIE_NODE_RECORD
*node_rec =
nodes_
[node_index];
273
return
&(node_rec->
forward_edges
[edge_index]);
274
}
276
inline
EDGE_REF
make_edge_ref
(
NODE_REF
node_index,
277
EDGE_INDEX
edge_index)
const
{
278
return
((node_index <<
flag_start_bit_
) |
279
(edge_index <<
LETTER_START_BIT
));
280
}
282
inline
void
link_edge
(
EDGE_RECORD
*edge,
NODE_REF
nxt,
bool
repeats,
283
int
direction
,
bool
word_end,
UNICHAR_ID
unichar_id) {
284
EDGE_RECORD
flags = 0;
285
if
(repeats) flags |=
MARKER_FLAG
;
286
if
(word_end) flags |=
WERD_END_FLAG
;
287
if
(direction ==
BACKWARD_EDGE
) flags |=
DIRECTION_FLAG
;
288
*edge = ((nxt <<
next_node_start_bit_
) |
289
(static_cast<EDGE_RECORD>(flags) <<
flag_start_bit_
) |
290
(static_cast<EDGE_RECORD>(unichar_id) <<
LETTER_START_BIT
));
291
}
293
inline
void
print_edge_rec
(
const
EDGE_RECORD
&edge_rec)
const
{
294
tprintf
(
"|"
REFFORMAT
"|%s%s%s|%d|"
,
next_node_from_edge_rec
(edge_rec),
295
marker_flag_from_edge_rec
(edge_rec) ?
"R,"
:
""
,
296
(
direction_from_edge_rec
(edge_rec) ==
FORWARD_EDGE
) ?
"F"
:
"B"
,
297
end_of_word_from_edge_rec
(edge_rec) ?
",E"
:
""
,
298
unichar_id_from_edge_rec
(edge_rec));
299
}
300
// Returns true if the next node in recorded the given EDGE_RECORD
301
// has exactly one forward edge.
302
inline
bool
can_be_eliminated
(
const
EDGE_RECORD
&edge_rec) {
303
NODE_REF
node_ref =
next_node_from_edge_rec
(edge_rec);
304
return
(node_ref != NO_EDGE &&
305
nodes_
[static_cast<int>(node_ref)]->forward_edges.
size
() == 1);
306
}
307
308
// Prints the contents of the Trie.
309
// At most max_num_edges will be printed for each node.
310
void
print_all
(
const
char
* msg,
int
max_num_edges) {
311
tprintf
(
"\n__________________________\n%s\n"
, msg);
312
for
(
int
i = 0; i <
nodes_
.
size
(); ++i)
print_node
(i, max_num_edges);
313
tprintf
(
"__________________________\n"
);
314
}
315
316
// Finds the edge with the given direction, word_end and unichar_id
317
// in the node indicated by node_ref. Fills in the pointer to the
318
// EDGE_RECORD and the index of the edge with the the values
319
// corresponding to the edge found. Returns true if an edge was found.
320
bool
edge_char_of
(
NODE_REF
node_ref,
NODE_REF
next_node
,
321
int
direction
,
bool
word_end,
UNICHAR_ID
unichar_id,
322
EDGE_RECORD
**edge_ptr,
EDGE_INDEX
*edge_index)
const
;
323
324
// Adds an single edge linkage between node1 and node2 in the direction
325
// indicated by direction argument.
326
bool
add_edge_linkage
(
NODE_REF
node1,
NODE_REF
node2,
bool
repeats,
327
int
direction
,
bool
word_end,
328
UNICHAR_ID
unichar_id);
329
330
// Adds forward edge linkage from node1 to node2 and the corresponding
331
// backward edge linkage in the other direction.
332
bool
add_new_edge
(
NODE_REF
node1,
NODE_REF
node2,
333
bool
repeats,
bool
word_end,
UNICHAR_ID
unichar_id) {
334
return
(
add_edge_linkage
(node1, node2, repeats,
FORWARD_EDGE
,
335
word_end, unichar_id) &&
336
add_edge_linkage
(node2, node1, repeats,
BACKWARD_EDGE
,
337
word_end, unichar_id));
338
}
339
340
// Sets the word ending flags in an already existing edge pair.
341
// Returns true on success.
342
void
add_word_ending
(
EDGE_RECORD
*edge,
343
NODE_REF
the_next_node,
344
bool
repeats,
345
UNICHAR_ID
unichar_id);
346
347
// Allocates space for a new node in the Trie.
348
NODE_REF
new_dawg_node
();
349
350
// Removes a single edge linkage to between node1 and node2 in the
351
// direction indicated by direction argument.
352
void
remove_edge_linkage
(
NODE_REF
node1,
NODE_REF
node2,
int
direction
,
353
bool
word_end,
UNICHAR_ID
unichar_id);
354
355
// Removes forward edge linkage from node1 to node2 and the corresponding
356
// backward edge linkage in the other direction.
357
void
remove_edge
(
NODE_REF
node1,
NODE_REF
node2,
358
bool
word_end,
UNICHAR_ID
unichar_id) {
359
remove_edge_linkage
(node1, node2,
FORWARD_EDGE
, word_end, unichar_id);
360
remove_edge_linkage
(node2, node1,
BACKWARD_EDGE
, word_end, unichar_id);
361
}
362
363
// Compares edge1 and edge2 in the given node to see if they point to two
364
// next nodes that could be collapsed. If they do, performs the reduction
365
// and returns true.
366
bool
eliminate_redundant_edges
(
NODE_REF
node,
const
EDGE_RECORD
&edge1,
367
const
EDGE_RECORD
&edge2);
368
369
// Assuming that edge_index indicates the first edge in a group of edges
370
// in this node with a particular letter value, looks through these edges
371
// to see if any of them can be collapsed. If so does it. Returns to the
372
// caller when all edges with this letter have been reduced.
373
// Returns true if further reduction is possible with this same letter.
374
bool
reduce_lettered_edges
(
EDGE_INDEX
edge_index,
375
UNICHAR_ID
unichar_id,
376
NODE_REF
node,
377
const
EDGE_VECTOR
&backward_edges,
378
NODE_MARKER
reduced_nodes);
379
386
void
sort_edges
(
EDGE_VECTOR
*edges);
387
389
void
reduce_node_input
(
NODE_REF
node,
NODE_MARKER
reduced_nodes);
390
391
// Returns the pattern unichar id for the given character class code.
392
UNICHAR_ID
character_class_to_pattern
(
char
ch);
393
394
// Member variables
395
TRIE_NODES
nodes_
;
// vector of nodes in the Trie
396
uinT64
num_edges_
;
// sum of all edges (forward and backward)
397
uinT64
max_num_edges_
;
// maximum number of edges allowed
398
uinT64
deref_direction_mask_
;
// mask for EDGE_REF to extract direction
399
uinT64
deref_node_index_mask_
;
// mask for EDGE_REF to extract node index
400
// Variables for translating character class codes denoted in user patterns
401
// file to the unichar ids used to represent them in a Trie.
402
bool
initialized_patterns_
;
403
UNICHAR_ID
alpha_pattern_
;
404
UNICHAR_ID
digit_pattern_
;
405
UNICHAR_ID
alphanum_pattern_
;
406
UNICHAR_ID
punc_pattern_
;
407
UNICHAR_ID
lower_pattern_
;
408
UNICHAR_ID
upper_pattern_
;
409
};
410
}
// namespace tesseract
411
412
#endif
mnt
data
src
tesseract-ocr
dict
trie.h
Generated on Thu Nov 1 2012 20:19:48 for Tesseract by
1.8.1