Tesseract
3.02
Main Page
Related Pages
Modules
Namespaces
Classes
Files
File List
File Members
All
Classes
Namespaces
Files
Functions
Variables
Typedefs
Enumerations
Enumerator
Friends
Macros
Groups
Pages
shapetable.h
Go to the documentation of this file.
1
// Copyright 2010 Google Inc. All Rights Reserved.
2
// Author: rays@google.com (Ray Smith)
4
// File: shapetable.h
5
// Description: Class to map a classifier shape index to unicharset
6
// indices and font indices.
7
// Author: Ray Smith
8
// Created: Thu Oct 28 17:46:32 PDT 2010
9
//
10
// (C) Copyright 2010, Google Inc.
11
// Licensed under the Apache License, Version 2.0 (the "License");
12
// you may not use this file except in compliance with the License.
13
// You may obtain a copy of the License at
14
// http://www.apache.org/licenses/LICENSE-2.0
15
// Unless required by applicable law or agreed to in writing, software
16
// distributed under the License is distributed on an "AS IS" BASIS,
17
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
18
// See the License for the specific language governing permissions and
19
// limitations under the License.
20
//
22
23
#ifndef TESSERACT_CLASSIFY_SHAPETABLE_H_
24
#define TESSERACT_CLASSIFY_SHAPETABLE_H_
25
26
#include "
genericvector.h
"
27
#include "
intmatcher.h
"
28
29
class
STRING
;
30
class
UNICHARSET
;
31
32
namespace
tesseract
{
33
34
// Simple struct to hold a set of fonts associated with a single unichar-id.
35
// A vector of UnicharAndFonts makes a shape.
36
struct
UnicharAndFonts
{
37
UnicharAndFonts
() :
unichar_id
(0) {
38
}
39
UnicharAndFonts
(
int
uni_id,
int
font_id) :
unichar_id
(uni_id) {
40
font_ids
.
push_back
(font_id);
41
}
42
43
// Writes to the given file. Returns false in case of error.
44
bool
Serialize
(FILE* fp)
const
;
45
// Reads from the given file. Returns false in case of error.
46
// If swap is true, assumes a big/little-endian swap is needed.
47
bool
DeSerialize
(
bool
swap, FILE* fp);
48
49
// Sort function to sort a pair of UnicharAndFonts by unichar_id.
50
static
int
SortByUnicharId
(
const
void
* v1,
const
void
* v2);
51
52
GenericVector<inT32>
font_ids
;
53
inT32
unichar_id
;
54
};
55
56
// A Shape is a collection of unichar-ids and a list of fonts associated with
57
// each, organized as a vector of UnicharAndFonts. Conceptually a Shape is
58
// a classifiable unit, and represents a group of characters or parts of
59
// characters that have a similar or identical shape. Shapes/ShapeTables may
60
// be organized hierarchically from identical shapes at the leaves to vaguely
61
// similar shapes near the root.
62
class
Shape
{
63
public
:
64
Shape
() : destination_index_(-1) {}
65
66
// Writes to the given file. Returns false in case of error.
67
bool
Serialize
(FILE* fp)
const
;
68
// Reads from the given file. Returns false in case of error.
69
// If swap is true, assumes a big/little-endian swap is needed.
70
bool
DeSerialize
(
bool
swap, FILE* fp);
71
72
int
destination_index
()
const
{
73
return
destination_index_;
74
}
75
void
set_destination_index
(
int
index) {
76
destination_index_ = index;
77
}
78
int
size
()
const
{
79
return
unichars_.
size
();
80
}
81
// Returns a UnicharAndFonts entry for the given index, which must be
82
// in the range [0, size()).
83
const
UnicharAndFonts
&
operator[]
(
int
index)
const
{
84
return
unichars_[index];
85
}
86
// Adds a font_id for the given unichar_id. If the unichar_id is not
87
// in the shape, it is added.
88
void
AddToShape
(
int
unichar_id,
int
font_id);
89
// Adds everything in other to this.
90
void
AddShape
(
const
Shape
& other);
91
// Returns true if the shape contains the given unichar_id, font_id pair.
92
bool
ContainsUnicharAndFont
(
int
unichar_id,
int
font_id)
const
;
93
// Returns true if the shape contains the given unichar_id, ignoring font.
94
bool
ContainsUnichar
(
int
unichar_id)
const
;
95
// Returns true if the shape contains the given font, ignoring unichar_id.
96
bool
ContainsFont
(
int
font_id)
const
;
97
// Returns true if this is a subset (including equal) of other.
98
bool
IsSubsetOf
(
const
Shape
& other)
const
;
99
// Returns true if the lists of unichar ids are the same in this and other,
100
// ignoring fonts.
101
// NOT const, as it will sort the unichars on demand.
102
bool
IsEqualUnichars
(
Shape
* other);
103
104
private
:
105
// Sorts the unichars_ vector by unichar.
106
void
SortUnichars();
107
108
// Flag indicates that the unichars are sorted, allowing faster set
109
// operations with another shape.
110
bool
unichars_sorted_;
111
// If this Shape is part of a ShapeTable the destiation_index_ is the index
112
// of some other shape in the ShapeTable with which this shape is merged.
113
int
destination_index_;
114
// Array of unichars, each with a set of fonts. Each unichar has at most
115
// one entry in the vector.
116
GenericVector<UnicharAndFonts>
unichars_;
117
};
118
119
// ShapeTable is a class to encapsulate the triple indirection that is
120
// used here.
121
// ShapeTable is a vector of shapes.
122
// Each shape is a vector of UnicharAndFonts representing the set of unichars
123
// that the shape represents.
124
// Each UnicharAndFonts also lists the fonts of the unichar_id that were
125
// mapped to the shape during training.
126
class
ShapeTable
{
127
public
:
128
ShapeTable
();
129
// The UNICHARSET reference supplied here, or in set_unicharset below must
130
// exist for the entire life of the ShapeTable. It is used only by DebugStr.
131
explicit
ShapeTable
(
const
UNICHARSET
&
unicharset
);
132
133
// Writes to the given file. Returns false in case of error.
134
bool
Serialize
(FILE* fp)
const
;
135
// Reads from the given file. Returns false in case of error.
136
// If swap is true, assumes a big/little-endian swap is needed.
137
bool
DeSerialize
(
bool
swap, FILE* fp);
138
139
// Accessors.
140
int
NumShapes
()
const
{
141
return
shape_table_.
size
();
142
}
143
const
UNICHARSET
&
unicharset
()
const
{
144
return
*unicharset_;
145
}
146
// Shapetable takes a pointer to the UNICHARSET, so it must persist for the
147
// entire life of the ShapeTable.
148
void
set_unicharset
(
const
UNICHARSET
& unicharset) {
149
unicharset_ = &
unicharset
;
150
}
151
// Returns a string listing the classes/fonts in a shape.
152
STRING
DebugStr
(
int
shape_id)
const
;
153
// Returns a debug string summarizing the table.
154
STRING
SummaryStr
()
const
;
155
156
// Adds a new shape starting with the given unichar_id and font_id.
157
// Returns the assigned index.
158
int
AddShape
(
int
unichar_id,
int
font_id);
159
// Adds a copy of the given shape.
160
// Returns the assigned index.
161
int
AddShape
(
const
Shape
& other);
162
// Removes the shape given by the shape index. All indices above are changed!
163
void
DeleteShape
(
int
shape_id);
164
// Adds a font_id to the given existing shape index for the given
165
// unichar_id. If the unichar_id is not in the shape, it is added.
166
void
AddToShape
(
int
shape_id,
int
unichar_id,
int
font_id);
167
// Adds the given shape to the existing shape with the given index.
168
void
AddShapeToShape
(
int
shape_id,
const
Shape
& other);
169
// Returns the id of the shape that contains the given unichar and font.
170
// If not found, returns -1.
171
// If font_id < 0, the font_id is ignored and the first shape that matches
172
// the unichar_id is returned.
173
int
FindShape
(
int
unichar_id,
int
font_id)
const
;
174
// Returns the first unichar_id and font_id in the given shape.
175
void
GetFirstUnicharAndFont
(
int
shape_id,
176
int
* unichar_id,
int
* font_id)
const
;
177
178
// Accessors for the Shape with the given shape_id.
179
const
Shape
&
GetShape
(
int
shape_id)
const
{
180
return
*shape_table_[shape_id];
181
}
182
Shape
*
MutableShape
(
int
shape_id) {
183
return
shape_table_[shape_id];
184
}
185
186
// Expands all the classes/fonts in the shape individually to build
187
// a ShapeTable.
188
int
BuildFromShape
(
const
Shape
& shape,
const
ShapeTable
& master_shapes);
189
190
// Returns true if the shapes are already merged.
191
bool
AlreadyMerged
(
int
shape_id1,
int
shape_id2)
const
;
192
// Returns true if any shape contains multiple unichars.
193
bool
AnyMultipleUnichars
()
const
;
194
// Returns the maximum number of unichars over all shapes.
195
int
MaxNumUnichars
()
const
;
196
// Merges shapes with a common unichar over the [start, end) interval.
197
// Assumes single unichar per shape.
198
void
ForceFontMerges
(
int
start,
int
end);
199
// Returns the number of unichars in the master shape.
200
int
MasterUnicharCount
(
int
shape_id)
const
;
201
// Returns the sum of the font counts in the master shape.
202
int
MasterFontCount
(
int
shape_id)
const
;
203
// Returns the number of unichars that would result from merging the shapes.
204
int
MergedUnicharCount
(
int
shape_id1,
int
shape_id2)
const
;
205
// Merges two shape_ids, leaving shape_id2 marked as merged.
206
void
MergeShapes
(
int
shape_id1,
int
shape_id2);
207
// Appends the master shapes from other to this.
208
// Used to create a clean ShapeTable from a merged one, or to create a
209
// copy of a ShapeTable.
210
void
AppendMasterShapes
(
const
ShapeTable
& other);
211
// Returns the number of master shapes remaining after merging.
212
int
NumMasterShapes
()
const
;
213
// Returns the destination of this shape, (if merged), taking into account
214
// the fact that the destination may itself have been merged.
215
// For a non-merged shape, returns the input shape_id.
216
int
MasterDestinationIndex
(
int
shape_id)
const
;
217
218
private
:
219
// Pointer to a provided unicharset used only by the Debugstr member.
220
const
UNICHARSET
* unicharset_;
221
// Vector of pointers to the Shapes in this ShapeTable.
222
PointerVector<Shape>
shape_table_;
223
};
224
225
}
// namespace tesseract.
226
227
#endif // TESSERACT_CLASSIFY_SHAPETABLE_H_
mnt
data
src
tesseract-ocr
classify
shapetable.h
Generated on Thu Nov 1 2012 20:19:47 for Tesseract by
1.8.1