Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
combine_tessdata.cpp
Go to the documentation of this file.
1 
2 // File: combine_tessdata
3 // Description: Creates a unified traineddata file from several
4 // data files produced by the training process.
5 // Author: Daria Antonova
6 // Created: Wed Jun 03 11:26:43 PST 2009
7 //
8 // (C) Copyright 2009, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #include "tessdatamanager.h"
22 
23 // Main program to combine/extract/overwrite tessdata components
24 // in [lang].traineddata files.
25 //
26 // To combine all the individual tessdata components (unicharset, DAWGs,
27 // classifier templates, ambiguities, language configs) located at, say,
28 // /home/$USER/temp/eng.* run:
29 //
30 // combine_tessdata /home/$USER/temp/eng.
31 //
32 // The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
33 //
34 // Specify option -e if you would like to extract individual components
35 // from a combined traineddata file. For example, to extract language config
36 // file and the unicharset from tessdata/eng.traineddata run:
37 //
38 // combine_tessdata -e tessdata/eng.traineddata
39 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
40 //
41 // The desired config file and unicharset will be written to
42 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
43 //
44 // Specify option -o to overwrite individual components of the given
45 // [lang].traineddata file. For example, to overwrite language config
46 // and unichar ambiguities files in tessdata/eng.traineddata use:
47 //
48 // combine_tessdata -o tessdata/eng.traineddata
49 // /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
50 //
51 // As a result, tessdata/eng.traineddata will contain the new language config
52 // and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
53 //
54 // Note: the file names of the files to extract to and to overwrite from should
55 // have the appropriate file suffixes (extensions) indicating their tessdata
56 // component type (.unicharset for the unicharset, .unicharambigs for unichar
57 // ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
58 //
59 // Specify option -u to unpack all the components to the specified path:
60 //
61 // combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
62 //
63 // This will create /home/$USER/temp/eng.* files with individual tessdata
64 // components from tessdata/eng.traineddata.
65 //
66 int main(int argc, char **argv) {
67  int i;
68  if (argc == 2) {
69  printf("Combining tessdata files\n");
70  STRING output_file = argv[1];
71  output_file += kTrainedDataSuffix;
73  argv[1], output_file.string())) {
74  char* last = &argv[1][strlen(argv[1])-1];
75  printf("Error combining tessdata files into %s\n",
76  output_file.string());
77  if (*last != '.')
78  printf("Hint: the prefix is missing a period (.)\n");
79  }
80  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
81  strcmp(argv[1], "-u") == 0)) {
82  // Initialize TessdataManager with the data in the given traineddata file.
84  tm.Init(argv[2], 0);
85  printf("Extracting tessdata components from %s\n", argv[2]);
86  if (strcmp(argv[1], "-e") == 0) {
87  for (i = 3; i < argc; ++i) {
88  if (tm.ExtractToFile(argv[i])) {
89  printf("Wrote %s\n", argv[i]);
90  } else {
91  printf("Not extracting %s, since this component"
92  " is not present\n", argv[i]);
93  }
94  }
95  } else { // extract all the components
96  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
97  STRING filename = argv[3];
98  filename += tesseract::kTessdataFileSuffixes[i];
99  if (tm.ExtractToFile(filename.string())) {
100  printf("Wrote %s\n", filename.string());
101  }
102  }
103  }
104  tm.End();
105  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
106  // Rename the current traineddata file to a temporary name.
107  const char *new_traineddata_filename = argv[2];
108  STRING traineddata_filename = new_traineddata_filename;
109  traineddata_filename += ".__tmp__";
110  if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
111  tprintf("Failed to create a temporary file %s\n",
112  traineddata_filename.string());
113  exit(1);
114  }
115 
116  // Initialize TessdataManager with the data in the given traineddata file.
118  tm.Init(traineddata_filename.string(), 0);
119 
120  // Write the updated traineddata file.
121  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
122  tm.End();
123  } else {
124  printf("Usage for combining tessdata components:\n"
125  "%s language_data_path_prefix (e.g. tessdata/eng.)\n", argv[0]);
126  printf("Usage for extracting tessdata components:\n"
127  "%s -e traineddata_file [output_component_file...]\n", argv[0]);
128  printf("Usage for overwriting tessdata components:\n"
129  "%s -o traineddata_file [input_component_file...]\n", argv[0]);
130  printf("Usage for unpacking all tessdata components:\n"
131  "%s -u traineddata_file output_path_prefix"
132  " (e.g. /tmp/eng.)\n", argv[0]);
133  return 1;
134  }
135 }