Tesseract  3.02
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Groups Pages
cluster.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cluster.c
3  ** Purpose: Routines for clustering points in N-D space
4  ** Author: Dan Johnson
5  ** History: 5/29/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 #include "oldheap.h"
19 #include "const.h"
20 #include "cluster.h"
21 #include "emalloc.h"
22 #include "helpers.h"
23 #include "matrix.h"
24 #include "tprintf.h"
25 #include "danerror.h"
26 #include "freelist.h"
27 #include <math.h>
28 
29 #define HOTELLING 1 // If true use Hotelling's test to decide where to split.
30 #define FTABLE_X 10 // Size of FTable.
31 #define FTABLE_Y 100 // Size of FTable.
32 
33 // Table of values approximating the cumulative F-distribution for a confidence of 1%.
34 const double FTable[FTABLE_Y][FTABLE_X] = {
35  {4052.19, 4999.52, 5403.34, 5624.62, 5763.65, 5858.97, 5928.33, 5981.10, 6022.50, 6055.85,},
36  {98.502, 99.000, 99.166, 99.249, 99.300, 99.333, 99.356, 99.374, 99.388, 99.399,},
37  {34.116, 30.816, 29.457, 28.710, 28.237, 27.911, 27.672, 27.489, 27.345, 27.229,},
38  {21.198, 18.000, 16.694, 15.977, 15.522, 15.207, 14.976, 14.799, 14.659, 14.546,},
39  {16.258, 13.274, 12.060, 11.392, 10.967, 10.672, 10.456, 10.289, 10.158, 10.051,},
40  {13.745, 10.925, 9.780, 9.148, 8.746, 8.466, 8.260, 8.102, 7.976, 7.874,},
41  {12.246, 9.547, 8.451, 7.847, 7.460, 7.191, 6.993, 6.840, 6.719, 6.620,},
42  {11.259, 8.649, 7.591, 7.006, 6.632, 6.371, 6.178, 6.029, 5.911, 5.814,},
43  {10.561, 8.022, 6.992, 6.422, 6.057, 5.802, 5.613, 5.467, 5.351, 5.257,},
44  {10.044, 7.559, 6.552, 5.994, 5.636, 5.386, 5.200, 5.057, 4.942, 4.849,},
45  { 9.646, 7.206, 6.217, 5.668, 5.316, 5.069, 4.886, 4.744, 4.632, 4.539,},
46  { 9.330, 6.927, 5.953, 5.412, 5.064, 4.821, 4.640, 4.499, 4.388, 4.296,},
47  { 9.074, 6.701, 5.739, 5.205, 4.862, 4.620, 4.441, 4.302, 4.191, 4.100,},
48  { 8.862, 6.515, 5.564, 5.035, 4.695, 4.456, 4.278, 4.140, 4.030, 3.939,},
49  { 8.683, 6.359, 5.417, 4.893, 4.556, 4.318, 4.142, 4.004, 3.895, 3.805,},
50  { 8.531, 6.226, 5.292, 4.773, 4.437, 4.202, 4.026, 3.890, 3.780, 3.691,},
51  { 8.400, 6.112, 5.185, 4.669, 4.336, 4.102, 3.927, 3.791, 3.682, 3.593,},
52  { 8.285, 6.013, 5.092, 4.579, 4.248, 4.015, 3.841, 3.705, 3.597, 3.508,},
53  { 8.185, 5.926, 5.010, 4.500, 4.171, 3.939, 3.765, 3.631, 3.523, 3.434,},
54  { 8.096, 5.849, 4.938, 4.431, 4.103, 3.871, 3.699, 3.564, 3.457, 3.368,},
55  { 8.017, 5.780, 4.874, 4.369, 4.042, 3.812, 3.640, 3.506, 3.398, 3.310,},
56  { 7.945, 5.719, 4.817, 4.313, 3.988, 3.758, 3.587, 3.453, 3.346, 3.258,},
57  { 7.881, 5.664, 4.765, 4.264, 3.939, 3.710, 3.539, 3.406, 3.299, 3.211,},
58  { 7.823, 5.614, 4.718, 4.218, 3.895, 3.667, 3.496, 3.363, 3.256, 3.168,},
59  { 7.770, 5.568, 4.675, 4.177, 3.855, 3.627, 3.457, 3.324, 3.217, 3.129,},
60  { 7.721, 5.526, 4.637, 4.140, 3.818, 3.591, 3.421, 3.288, 3.182, 3.094,},
61  { 7.677, 5.488, 4.601, 4.106, 3.785, 3.558, 3.388, 3.256, 3.149, 3.062,},
62  { 7.636, 5.453, 4.568, 4.074, 3.754, 3.528, 3.358, 3.226, 3.120, 3.032,},
63  { 7.598, 5.420, 4.538, 4.045, 3.725, 3.499, 3.330, 3.198, 3.092, 3.005,},
64  { 7.562, 5.390, 4.510, 4.018, 3.699, 3.473, 3.305, 3.173, 3.067, 2.979,},
65  { 7.530, 5.362, 4.484, 3.993, 3.675, 3.449, 3.281, 3.149, 3.043, 2.955,},
66  { 7.499, 5.336, 4.459, 3.969, 3.652, 3.427, 3.258, 3.127, 3.021, 2.934,},
67  { 7.471, 5.312, 4.437, 3.948, 3.630, 3.406, 3.238, 3.106, 3.000, 2.913,},
68  { 7.444, 5.289, 4.416, 3.927, 3.611, 3.386, 3.218, 3.087, 2.981, 2.894,},
69  { 7.419, 5.268, 4.396, 3.908, 3.592, 3.368, 3.200, 3.069, 2.963, 2.876,},
70  { 7.396, 5.248, 4.377, 3.890, 3.574, 3.351, 3.183, 3.052, 2.946, 2.859,},
71  { 7.373, 5.229, 4.360, 3.873, 3.558, 3.334, 3.167, 3.036, 2.930, 2.843,},
72  { 7.353, 5.211, 4.343, 3.858, 3.542, 3.319, 3.152, 3.021, 2.915, 2.828,},
73  { 7.333, 5.194, 4.327, 3.843, 3.528, 3.305, 3.137, 3.006, 2.901, 2.814,},
74  { 7.314, 5.179, 4.313, 3.828, 3.514, 3.291, 3.124, 2.993, 2.888, 2.801,},
75  { 7.296, 5.163, 4.299, 3.815, 3.501, 3.278, 3.111, 2.980, 2.875, 2.788,},
76  { 7.280, 5.149, 4.285, 3.802, 3.488, 3.266, 3.099, 2.968, 2.863, 2.776,},
77  { 7.264, 5.136, 4.273, 3.790, 3.476, 3.254, 3.087, 2.957, 2.851, 2.764,},
78  { 7.248, 5.123, 4.261, 3.778, 3.465, 3.243, 3.076, 2.946, 2.840, 2.754,},
79  { 7.234, 5.110, 4.249, 3.767, 3.454, 3.232, 3.066, 2.935, 2.830, 2.743,},
80  { 7.220, 5.099, 4.238, 3.757, 3.444, 3.222, 3.056, 2.925, 2.820, 2.733,},
81  { 7.207, 5.087, 4.228, 3.747, 3.434, 3.213, 3.046, 2.916, 2.811, 2.724,},
82  { 7.194, 5.077, 4.218, 3.737, 3.425, 3.204, 3.037, 2.907, 2.802, 2.715,},
83  { 7.182, 5.066, 4.208, 3.728, 3.416, 3.195, 3.028, 2.898, 2.793, 2.706,},
84  { 7.171, 5.057, 4.199, 3.720, 3.408, 3.186, 3.020, 2.890, 2.785, 2.698,},
85  { 7.159, 5.047, 4.191, 3.711, 3.400, 3.178, 3.012, 2.882, 2.777, 2.690,},
86  { 7.149, 5.038, 4.182, 3.703, 3.392, 3.171, 3.005, 2.874, 2.769, 2.683,},
87  { 7.139, 5.030, 4.174, 3.695, 3.384, 3.163, 2.997, 2.867, 2.762, 2.675,},
88  { 7.129, 5.021, 4.167, 3.688, 3.377, 3.156, 2.990, 2.860, 2.755, 2.668,},
89  { 7.119, 5.013, 4.159, 3.681, 3.370, 3.149, 2.983, 2.853, 2.748, 2.662,},
90  { 7.110, 5.006, 4.152, 3.674, 3.363, 3.143, 2.977, 2.847, 2.742, 2.655,},
91  { 7.102, 4.998, 4.145, 3.667, 3.357, 3.136, 2.971, 2.841, 2.736, 2.649,},
92  { 7.093, 4.991, 4.138, 3.661, 3.351, 3.130, 2.965, 2.835, 2.730, 2.643,},
93  { 7.085, 4.984, 4.132, 3.655, 3.345, 3.124, 2.959, 2.829, 2.724, 2.637,},
94  { 7.077, 4.977, 4.126, 3.649, 3.339, 3.119, 2.953, 2.823, 2.718, 2.632,},
95  { 7.070, 4.971, 4.120, 3.643, 3.333, 3.113, 2.948, 2.818, 2.713, 2.626,},
96  { 7.062, 4.965, 4.114, 3.638, 3.328, 3.108, 2.942, 2.813, 2.708, 2.621,},
97  { 7.055, 4.959, 4.109, 3.632, 3.323, 3.103, 2.937, 2.808, 2.703, 2.616,},
98  { 7.048, 4.953, 4.103, 3.627, 3.318, 3.098, 2.932, 2.803, 2.698, 2.611,},
99  { 7.042, 4.947, 4.098, 3.622, 3.313, 3.093, 2.928, 2.798, 2.693, 2.607,},
100  { 7.035, 4.942, 4.093, 3.618, 3.308, 3.088, 2.923, 2.793, 2.689, 2.602,},
101  { 7.029, 4.937, 4.088, 3.613, 3.304, 3.084, 2.919, 2.789, 2.684, 2.598,},
102  { 7.023, 4.932, 4.083, 3.608, 3.299, 3.080, 2.914, 2.785, 2.680, 2.593,},
103  { 7.017, 4.927, 4.079, 3.604, 3.295, 3.075, 2.910, 2.781, 2.676, 2.589,},
104  { 7.011, 4.922, 4.074, 3.600, 3.291, 3.071, 2.906, 2.777, 2.672, 2.585,},
105  { 7.006, 4.917, 4.070, 3.596, 3.287, 3.067, 2.902, 2.773, 2.668, 2.581,},
106  { 7.001, 4.913, 4.066, 3.591, 3.283, 3.063, 2.898, 2.769, 2.664, 2.578,},
107  { 6.995, 4.908, 4.062, 3.588, 3.279, 3.060, 2.895, 2.765, 2.660, 2.574,},
108  { 6.990, 4.904, 4.058, 3.584, 3.275, 3.056, 2.891, 2.762, 2.657, 2.570,},
109  { 6.985, 4.900, 4.054, 3.580, 3.272, 3.052, 2.887, 2.758, 2.653, 2.567,},
110  { 6.981, 4.896, 4.050, 3.577, 3.268, 3.049, 2.884, 2.755, 2.650, 2.563,},
111  { 6.976, 4.892, 4.047, 3.573, 3.265, 3.046, 2.881, 2.751, 2.647, 2.560,},
112  { 6.971, 4.888, 4.043, 3.570, 3.261, 3.042, 2.877, 2.748, 2.644, 2.557,},
113  { 6.967, 4.884, 4.040, 3.566, 3.258, 3.039, 2.874, 2.745, 2.640, 2.554,},
114  { 6.963, 4.881, 4.036, 3.563, 3.255, 3.036, 2.871, 2.742, 2.637, 2.551,},
115  { 6.958, 4.877, 4.033, 3.560, 3.252, 3.033, 2.868, 2.739, 2.634, 2.548,},
116  { 6.954, 4.874, 4.030, 3.557, 3.249, 3.030, 2.865, 2.736, 2.632, 2.545,},
117  { 6.950, 4.870, 4.027, 3.554, 3.246, 3.027, 2.863, 2.733, 2.629, 2.542,},
118  { 6.947, 4.867, 4.024, 3.551, 3.243, 3.025, 2.860, 2.731, 2.626, 2.539,},
119  { 6.943, 4.864, 4.021, 3.548, 3.240, 3.022, 2.857, 2.728, 2.623, 2.537,},
120  { 6.939, 4.861, 4.018, 3.545, 3.238, 3.019, 2.854, 2.725, 2.621, 2.534,},
121  { 6.935, 4.858, 4.015, 3.543, 3.235, 3.017, 2.852, 2.723, 2.618, 2.532,},
122  { 6.932, 4.855, 4.012, 3.540, 3.233, 3.014, 2.849, 2.720, 2.616, 2.529,},
123  { 6.928, 4.852, 4.010, 3.538, 3.230, 3.012, 2.847, 2.718, 2.613, 2.527,},
124  { 6.925, 4.849, 4.007, 3.535, 3.228, 3.009, 2.845, 2.715, 2.611, 2.524,},
125  { 6.922, 4.846, 4.004, 3.533, 3.225, 3.007, 2.842, 2.713, 2.609, 2.522,},
126  { 6.919, 4.844, 4.002, 3.530, 3.223, 3.004, 2.840, 2.711, 2.606, 2.520,},
127  { 6.915, 4.841, 3.999, 3.528, 3.221, 3.002, 2.838, 2.709, 2.604, 2.518,},
128  { 6.912, 4.838, 3.997, 3.525, 3.218, 3.000, 2.835, 2.706, 2.602, 2.515,},
129  { 6.909, 4.836, 3.995, 3.523, 3.216, 2.998, 2.833, 2.704, 2.600, 2.513,},
130  { 6.906, 4.833, 3.992, 3.521, 3.214, 2.996, 2.831, 2.702, 2.598, 2.511,},
131  { 6.904, 4.831, 3.990, 3.519, 3.212, 2.994, 2.829, 2.700, 2.596, 2.509,},
132  { 6.901, 4.829, 3.988, 3.517, 3.210, 2.992, 2.827, 2.698, 2.594, 2.507,},
133  { 6.898, 4.826, 3.986, 3.515, 3.208, 2.990, 2.825, 2.696, 2.592, 2.505,},
134  { 6.895, 4.824, 3.984, 3.513, 3.206, 2.988, 2.823, 2.694, 2.590, 2.503}
135 };
136 
137 /* define the variance which will be used as a minimum variance for any
138  dimension of any feature. Since most features are calculated from numbers
139  with a precision no better than 1 in 128, the variance should never be
140  less than the square of this number for parameters whose range is 1. */
141 #define MINVARIANCE 0.0004
142 
143 /* define the absolute minimum number of samples which must be present in
144  order to accurately test hypotheses about underlying probability
145  distributions. Define separately the minimum samples that are needed
146  before a statistical analysis is attempted; this number should be
147  equal to MINSAMPLES but can be set to a lower number for early testing
148  when very few samples are available. */
149 #define MINSAMPLESPERBUCKET 5
150 #define MINSAMPLES (MINBUCKETS * MINSAMPLESPERBUCKET)
151 #define MINSAMPLESNEEDED 1
152 
153 /* define the size of the table which maps normalized samples to
154  histogram buckets. Also define the number of standard deviations
155  in a normal distribution which are considered to be significant.
156  The mapping table will be defined in such a way that it covers
157  the specified number of standard deviations on either side of
158  the mean. BUCKETTABLESIZE should always be even. */
159 #define BUCKETTABLESIZE 1024
160 #define NORMALEXTENT 3.0
161 
162 struct TEMPCLUSTER {
165 };
166 
167 struct STATISTICS {
170  FLOAT32 *Min; // largest negative distance from the mean
171  FLOAT32 *Max; // largest positive distance from the mean
172 };
173 
// Histogram of one dimension of a cluster's samples, used to test
// whether the samples fit a hypothesized distribution (see
// DistributionOK / the chi-squared threshold below).
struct BUCKETS {
  DISTRIBUTION Distribution;       // distribution being tested for
  uinT32 SampleCount;              // # of samples in histogram
  FLOAT64 Confidence;              // confidence level of test
  FLOAT64 ChiSquared;              // test threshold
  uinT16 NumberOfBuckets;          // number of cells in histogram
  uinT16 Bucket[BUCKETTABLESIZE];  // mapping to histogram buckets
  uinT32 *Count;                   // frequency of occurrence histogram
  FLOAT32 *ExpectedCount;          // expected histogram
};
184 
185 struct CHISTRUCT{
189 };
190 
191 // For use with KDWalk / MakePotentialClusters
193  HEAP *heap; // heap used to hold temp clusters, "best" on top
194  TEMPCLUSTER *candidates; // array of potential clusters
195  KDTREE *tree; // kd-tree to be searched for neighbors
196  inT32 next; // next candidate to be used
197 };
198 
199 typedef FLOAT64 (*DENSITYFUNC) (inT32);
200 typedef FLOAT64 (*SOLVEFUNC) (CHISTRUCT *, double);
201 
202 #define Odd(N) ((N)%2)
203 #define Mirror(N,R) ((R) - (N) - 1)
204 #define Abs(N) ( ( (N) < 0 ) ? ( -(N) ) : (N) )
205 
206 //--------------Global Data Definitions and Declarations----------------------
207 /* the following variables describe a discrete normal distribution
208  which is used by NormalDensity() and NormalBucket(). The
209  constant NORMALEXTENT determines how many standard
210  deviations of the distribution are mapped onto the fixed
211  discrete range of x. x=0 is mapped to -NORMALEXTENT standard
212  deviations and x=BUCKETTABLESIZE is mapped to
213  +NORMALEXTENT standard deviations. */
214 #define SqrtOf2Pi 2.506628275
215 static const FLOAT64 kNormalStdDev = BUCKETTABLESIZE / (2.0 * NORMALEXTENT);
216 static const FLOAT64 kNormalVariance =
218 static const FLOAT64 kNormalMagnitude =
219  (2.0 * NORMALEXTENT) / (SqrtOf2Pi * BUCKETTABLESIZE);
220 static const FLOAT64 kNormalMean = BUCKETTABLESIZE / 2;
221 
222 /* define lookup tables used to compute the number of histogram buckets
223  that should be used for a given number of samples. */
224 #define LOOKUPTABLESIZE 8
225 #define MAXDEGREESOFFREEDOM MAXBUCKETS
226 
227 static const uinT32 kCountTable[LOOKUPTABLESIZE] = {
228  MINSAMPLES, 200, 400, 600, 800, 1000, 1500, 2000
229 }; // number of samples
230 
231 static const uinT16 kBucketsTable[LOOKUPTABLESIZE] = {
232  MINBUCKETS, 16, 20, 24, 27, 30, 35, MAXBUCKETS
233 }; // number of buckets
234 
235 /*-------------------------------------------------------------------------
236  Private Function Prototypes
237 --------------------------------------------------------------------------*/
238 void CreateClusterTree(CLUSTERER *Clusterer);
239 
240 void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster,
241  inT32 Level);
242 
244  CLUSTER *Cluster,
245  FLOAT32 *Distance);
246 
247 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster);
248 
250 register PARAM_DESC ParamDesc[],
251 register inT32 n1,
252 register inT32 n2,
253 register FLOAT32 m[],
254 register FLOAT32 m1[], register FLOAT32 m2[]);
255 
257 
258 PROTOTYPE *MakePrototype(CLUSTERER *Clusterer,
260  CLUSTER *Cluster);
261 
263  CLUSTER *Cluster,
264  STATISTICS *Statistics,
265  PROTOSTYLE Style,
266  inT32 MinSamples);
267 
270  CLUSTER *Cluster,
271  STATISTICS *Statistics);
272 
274  CLUSTER *Cluster,
275  STATISTICS *Statistics,
276  BUCKETS *Buckets);
277 
279  CLUSTER *Cluster,
280  STATISTICS *Statistics,
281  BUCKETS *Buckets);
282 
284  CLUSTER *Cluster,
285  STATISTICS *Statistics,
286  BUCKETS *NormalBuckets,
287  FLOAT64 Confidence);
288 
289 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc);
290 
291 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics);
292 
294 PARAM_DESC ParamDesc[], CLUSTER * Cluster);
295 
297  CLUSTER *Cluster,
298  STATISTICS *Statistics);
299 
301  CLUSTER *Cluster,
302  STATISTICS *Statistics);
303 
304 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics);
305 
306 PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster);
307 
308 BOOL8 Independent (PARAM_DESC ParamDesc[],
309 inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence);
310 
311 BUCKETS *GetBuckets(CLUSTERER* clusterer,
312  DISTRIBUTION Distribution,
313  uinT32 SampleCount,
314  FLOAT64 Confidence);
315 
316 BUCKETS *MakeBuckets(DISTRIBUTION Distribution,
317  uinT32 SampleCount,
318  FLOAT64 Confidence);
319 
321 
323 
325 
327 
329 
330 void FillBuckets(BUCKETS *Buckets,
331  CLUSTER *Cluster,
332  uinT16 Dim,
333  PARAM_DESC *ParamDesc,
334  FLOAT32 Mean,
335  FLOAT32 StdDev);
336 
337 uinT16 NormalBucket(PARAM_DESC *ParamDesc,
338  FLOAT32 x,
339  FLOAT32 Mean,
340  FLOAT32 StdDev);
341 
342 uinT16 UniformBucket(PARAM_DESC *ParamDesc,
343  FLOAT32 x,
344  FLOAT32 Mean,
345  FLOAT32 StdDev);
346 
347 BOOL8 DistributionOK(BUCKETS *Buckets);
348 
349 void FreeStatistics(STATISTICS *Statistics);
350 
351 void FreeBuckets(BUCKETS *Buckets);
352 
353 void FreeCluster(CLUSTER *Cluster);
354 
355 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets);
356 
357 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram,
358  void *arg2); // uinT16 *DesiredNumberOfBuckets);
359 
360 int ListEntryMatch(void *arg1, void *arg2);
361 
362 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount);
363 
364 void InitBuckets(BUCKETS *Buckets);
365 
366 int AlphaMatch(void *arg1, // CHISTRUCT *ChiStruct,
367  void *arg2); // CHISTRUCT *SearchKey);
368 
370 
371 FLOAT64 Solve(SOLVEFUNC Function,
372  void *FunctionParams,
373  FLOAT64 InitialGuess,
374  FLOAT64 Accuracy);
375 
376 FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x);
377 
379  CLUSTER *Cluster,
380  FLOAT32 MaxIllegal);
381 
382 double InvertMatrix(const float* input, int size, float* inv);
383 
384 //--------------------------Public Code--------------------------------------
394 CLUSTERER *
395 MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) {
396  CLUSTERER *Clusterer;
397  int i;
398 
399  // allocate main clusterer data structure and init simple fields
400  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
401  Clusterer->SampleSize = SampleSize;
402  Clusterer->NumberOfSamples = 0;
403  Clusterer->NumChar = 0;
404 
405  // init fields which will not be used initially
406  Clusterer->Root = NULL;
407  Clusterer->ProtoList = NIL_LIST;
408 
409  // maintain a copy of param descriptors in the clusterer data structure
410  Clusterer->ParamDesc =
411  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
412  for (i = 0; i < SampleSize; i++) {
413  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
414  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
415  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
416  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
417  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
418  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
419  Clusterer->ParamDesc[i].MidRange =
420  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
421  }
422 
423  // allocate a kd tree to hold the samples
424  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
425 
426  // Initialize cache of histogram buckets to minimize recomputing them.
427  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
428  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
429  Clusterer->bucket_cache[d][c] = NULL;
430  }
431 
432  return Clusterer;
433 } // MakeClusterer
434 
435 
450 SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature,
451  inT32 CharID) {
452  SAMPLE *Sample;
453  int i;
454 
455  // see if the samples have already been clustered - if so trap an error
456  if (Clusterer->Root != NULL)
458  "Can't add samples after they have been clustered");
459 
460  // allocate the new sample and initialize it
461  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
462  (Clusterer->SampleSize -
463  1) * sizeof (FLOAT32));
464  Sample->Clustered = FALSE;
465  Sample->Prototype = FALSE;
466  Sample->SampleCount = 1;
467  Sample->Left = NULL;
468  Sample->Right = NULL;
469  Sample->CharID = CharID;
470 
471  for (i = 0; i < Clusterer->SampleSize; i++)
472  Sample->Mean[i] = Feature[i];
473 
474  // add the sample to the KD tree - keep track of the total # of samples
475  Clusterer->NumberOfSamples++;
476  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
477  if (CharID >= Clusterer->NumChar)
478  Clusterer->NumChar = CharID + 1;
479 
480  // execute hook for monitoring clustering operation
481  // (*SampleCreationHook)( Sample );
482 
483  return (Sample);
484 } // MakeSample
485 
486 
505  //only create cluster tree if samples have never been clustered before
506  if (Clusterer->Root == NULL)
507  CreateClusterTree(Clusterer);
508 
509  //deallocate the old prototype list if one exists
510  FreeProtoList (&Clusterer->ProtoList);
511  Clusterer->ProtoList = NIL_LIST;
512 
513  //compute prototypes starting at the root node in the tree
514  ComputePrototypes(Clusterer, Config);
515  return (Clusterer->ProtoList);
516 } // ClusterSamples
517 
518 
532 void FreeClusterer(CLUSTERER *Clusterer) {
533  if (Clusterer != NULL) {
534  memfree (Clusterer->ParamDesc);
535  if (Clusterer->KDTree != NULL)
536  FreeKDTree (Clusterer->KDTree);
537  if (Clusterer->Root != NULL)
538  FreeCluster (Clusterer->Root);
539  // Free up all used buckets structures.
540  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
541  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
542  if (Clusterer->bucket_cache[d][c] != NULL)
543  FreeBuckets(Clusterer->bucket_cache[d][c]);
544  }
545 
546  memfree(Clusterer);
547  }
548 } // FreeClusterer
549 
550 
560 void FreeProtoList(LIST *ProtoList) {
561  destroy_nodes(*ProtoList, FreePrototype);
562 } // FreeProtoList
563 
564 
575 void FreePrototype(void *arg) { //PROTOTYPE *Prototype)
576  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
577 
578  // unmark the corresponding cluster (if there is one
579  if (Prototype->Cluster != NULL)
580  Prototype->Cluster->Prototype = FALSE;
581 
582  // deallocate the prototype statistics and then the prototype itself
583  if (Prototype->Distrib != NULL)
584  memfree (Prototype->Distrib);
585  if (Prototype->Mean != NULL)
586  memfree (Prototype->Mean);
587  if (Prototype->Style != spherical) {
588  if (Prototype->Variance.Elliptical != NULL)
589  memfree (Prototype->Variance.Elliptical);
590  if (Prototype->Magnitude.Elliptical != NULL)
591  memfree (Prototype->Magnitude.Elliptical);
592  if (Prototype->Weight.Elliptical != NULL)
593  memfree (Prototype->Weight.Elliptical);
594  }
595  memfree(Prototype);
596 } // FreePrototype
597 
598 
614 CLUSTER *NextSample(LIST *SearchState) {
615  CLUSTER *Cluster;
616 
617  if (*SearchState == NIL_LIST)
618  return (NULL);
619  Cluster = (CLUSTER *) first_node (*SearchState);
620  *SearchState = pop (*SearchState);
621  while (TRUE) {
622  if (Cluster->Left == NULL)
623  return (Cluster);
624  *SearchState = push (*SearchState, Cluster->Right);
625  Cluster = Cluster->Left;
626  }
627 } // NextSample
628 
629 
639 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) {
640  return (Proto->Mean[Dimension]);
641 } // Mean
642 
643 
654  switch (Proto->Style) {
655  case spherical:
656  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
657  case elliptical:
658  return ((FLOAT32)
659  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
660  case mixed:
661  switch (Proto->Distrib[Dimension]) {
662  case normal:
663  return ((FLOAT32)
664  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
665  case uniform:
666  case D_random:
667  return (Proto->Variance.Elliptical[Dimension]);
668  case DISTRIBUTION_COUNT:
669  ASSERT_HOST(!"Distribution count not allowed!");
670  }
671  }
672  return 0.0f;
673 } // StandardDeviation
674 
675 
676 /*---------------------------------------------------------------------------
677  Private Code
678 ----------------------------------------------------------------------------*/
694 void CreateClusterTree(CLUSTERER *Clusterer) {
695  ClusteringContext context;
696  HEAPENTRY HeapEntry;
697  TEMPCLUSTER *PotentialCluster;
698 
699  // each sample and its nearest neighbor form a "potential" cluster
700  // save these in a heap with the "best" potential clusters on top
701  context.tree = Clusterer->KDTree;
702  context.candidates = (TEMPCLUSTER *)
703  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
704  context.next = 0;
705  context.heap = MakeHeap(Clusterer->NumberOfSamples);
706  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
707 
708  // form potential clusters into actual clusters - always do "best" first
709  while (GetTopOfHeap(context.heap, &HeapEntry) != EMPTY) {
710  PotentialCluster = (TEMPCLUSTER *)HeapEntry.Data;
711 
712  // if main cluster of potential cluster is already in another cluster
713  // then we don't need to worry about it
714  if (PotentialCluster->Cluster->Clustered) {
715  continue;
716  }
717 
718  // if main cluster is not yet clustered, but its nearest neighbor is
719  // then we must find a new nearest neighbor
720  else if (PotentialCluster->Neighbor->Clustered) {
721  PotentialCluster->Neighbor =
722  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
723  &HeapEntry.Key);
724  if (PotentialCluster->Neighbor != NULL) {
725  HeapStore(context.heap, &HeapEntry);
726  }
727  }
728 
729  // if neither cluster is already clustered, form permanent cluster
730  else {
731  PotentialCluster->Cluster =
732  MakeNewCluster(Clusterer, PotentialCluster);
733  PotentialCluster->Neighbor =
734  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
735  &HeapEntry.Key);
736  if (PotentialCluster->Neighbor != NULL) {
737  HeapStore(context.heap, &HeapEntry);
738  }
739  }
740  }
741 
742  // the root node in the cluster tree is now the only node in the kd-tree
743  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
744 
745  // free up the memory used by the K-D tree, heap, and temp clusters
746  FreeKDTree(context.tree);
747  Clusterer->KDTree = NULL;
748  FreeHeap(context.heap);
749  memfree(context.candidates);
750 } // CreateClusterTree
751 
752 
765  CLUSTER *Cluster, inT32 Level) {
766  HEAPENTRY HeapEntry;
767  int next = context->next;
768  context->candidates[next].Cluster = Cluster;
769  HeapEntry.Data = (char *) &(context->candidates[next]);
770  context->candidates[next].Neighbor =
771  FindNearestNeighbor(context->tree,
772  context->candidates[next].Cluster,
773  &HeapEntry.Key);
774  if (context->candidates[next].Neighbor != NULL) {
775  HeapStore(context->heap, &HeapEntry);
776  context->next++;
777  }
778 } // MakePotentialClusters
779 
780 
797 CLUSTER *
798 FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance)
799 #define MAXNEIGHBORS 2
800 #define MAXDISTANCE MAX_FLOAT32
801 {
802  CLUSTER *Neighbor[MAXNEIGHBORS];
803  FLOAT32 Dist[MAXNEIGHBORS];
804  int NumberOfNeighbors;
805  inT32 i;
806  CLUSTER *BestNeighbor;
807 
808  // find the 2 nearest neighbors of the cluster
810  &NumberOfNeighbors, (void **)Neighbor, Dist);
811 
812  // search for the nearest neighbor that is not the cluster itself
813  *Distance = MAXDISTANCE;
814  BestNeighbor = NULL;
815  for (i = 0; i < NumberOfNeighbors; i++) {
816  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
817  *Distance = Dist[i];
818  BestNeighbor = Neighbor[i];
819  }
820  }
821  return BestNeighbor;
822 } // FindNearestNeighbor
823 
824 
837 CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) {
838  CLUSTER *Cluster;
839 
840  // allocate the new cluster and initialize it
841  Cluster = (CLUSTER *) Emalloc(
842  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
843  Cluster->Clustered = FALSE;
844  Cluster->Prototype = FALSE;
845  Cluster->Left = TempCluster->Cluster;
846  Cluster->Right = TempCluster->Neighbor;
847  Cluster->CharID = -1;
848 
849  // mark the old clusters as "clustered" and delete them from the kd-tree
850  Cluster->Left->Clustered = TRUE;
851  Cluster->Right->Clustered = TRUE;
852  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
853  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
854 
855  // compute the mean and sample count for the new cluster
856  Cluster->SampleCount =
857  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
858  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
859  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
860 
861  // add the new cluster to the KD tree
862  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
863  return Cluster;
864 } // MakeNewCluster
865 
866 
883  PARAM_DESC ParamDesc[],
884  inT32 n1,
885  inT32 n2,
886  FLOAT32 m[],
887  FLOAT32 m1[], FLOAT32 m2[]) {
888  inT32 i, n;
889 
890  n = n1 + n2;
891  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
892  if (ParamDesc->Circular) {
893  // if distance between means is greater than allowed
894  // reduce upper point by one "rotation" to compute mean
895  // then normalize the mean back into the accepted range
896  if ((*m2 - *m1) > ParamDesc->HalfRange) {
897  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
898  if (*m < ParamDesc->Min)
899  *m += ParamDesc->Range;
900  }
901  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
902  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
903  if (*m < ParamDesc->Min)
904  *m += ParamDesc->Range;
905  }
906  else
907  *m = (n1 * *m1 + n2 * *m2) / n;
908  }
909  else
910  *m = (n1 * *m1 + n2 * *m2) / n;
911  }
912  return n;
913 } // MergeClusters
914 
915 
928  LIST ClusterStack = NIL_LIST;
929  CLUSTER *Cluster;
930  PROTOTYPE *Prototype;
931 
932  // use a stack to keep track of clusters waiting to be processed
933  // initially the only cluster on the stack is the root cluster
934  if (Clusterer->Root != NULL)
935  ClusterStack = push (NIL_LIST, Clusterer->Root);
936 
937  // loop until we have analyzed all clusters which are potential prototypes
938  while (ClusterStack != NIL_LIST) {
939  // remove the next cluster to be analyzed from the stack
940  // try to make a prototype from the cluster
941  // if successful, put it on the proto list, else split the cluster
942  Cluster = (CLUSTER *) first_node (ClusterStack);
943  ClusterStack = pop (ClusterStack);
944  Prototype = MakePrototype(Clusterer, Config, Cluster);
945  if (Prototype != NULL) {
946  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
947  }
948  else {
949  ClusterStack = push (ClusterStack, Cluster->Right);
950  ClusterStack = push (ClusterStack, Cluster->Left);
951  }
952  }
953 } // ComputePrototypes
954 
955 
976  CLUSTER *Cluster) {
977  STATISTICS *Statistics;
978  PROTOTYPE *Proto;
979  BUCKETS *Buckets;
980 
981  // filter out clusters which contain samples from the same character
982  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
983  return NULL;
984 
985  // compute the covariance matrix and ranges for the cluster
986  Statistics =
987  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
988 
989  // check for degenerate clusters which need not be analyzed further
990  // note that the MinSamples test assumes that all clusters with multiple
991  // character samples have been removed (as above)
992  Proto = MakeDegenerateProto(
993  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
994  (inT32) (Config->MinSamples * Clusterer->NumChar));
995  if (Proto != NULL) {
996  FreeStatistics(Statistics);
997  return Proto;
998  }
999  // check to ensure that all dimensions are independent
1000  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
1001  Statistics->CoVariance, Config->Independence)) {
1002  FreeStatistics(Statistics);
1003  return NULL;
1004  }
1005 
1006  if (HOTELLING && Config->ProtoStyle == elliptical) {
1007  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1008  if (Proto != NULL) {
1009  FreeStatistics(Statistics);
1010  return Proto;
1011  }
1012  }
1013 
1014  // create a histogram data structure used to evaluate distributions
1015  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1016  Config->Confidence);
1017 
1018  // create a prototype based on the statistics and test it
1019  switch (Config->ProtoStyle) {
1020  case spherical:
1021  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1022  break;
1023  case elliptical:
1024  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1025  break;
1026  case mixed:
1027  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1028  Config->Confidence);
1029  break;
1030  case automatic:
1031  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1032  if (Proto != NULL)
1033  break;
1034  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1035  if (Proto != NULL)
1036  break;
1037  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1038  Config->Confidence);
1039  break;
1040  }
1041  FreeStatistics(Statistics);
1042  return Proto;
1043 } // MakePrototype
1044 
1045 
1067 PROTOTYPE *MakeDegenerateProto( //this was MinSample
1068  uinT16 N,
1069  CLUSTER *Cluster,
1070  STATISTICS *Statistics,
1071  PROTOSTYLE Style,
1072  inT32 MinSamples) {
1073  PROTOTYPE *Proto = NULL;
1074 
1075  if (MinSamples < MINSAMPLESNEEDED)
1076  MinSamples = MINSAMPLESNEEDED;
1077 
1078  if (Cluster->SampleCount < MinSamples) {
1079  switch (Style) {
1080  case spherical:
1081  Proto = NewSphericalProto (N, Cluster, Statistics);
1082  break;
1083  case elliptical:
1084  case automatic:
1085  Proto = NewEllipticalProto (N, Cluster, Statistics);
1086  break;
1087  case mixed:
1088  Proto = NewMixedProto (N, Cluster, Statistics);
1089  break;
1090  }
1091  Proto->Significant = FALSE;
1092  }
1093  return (Proto);
1094 } // MakeDegenerateProto
1095 
                               CLUSTER *Cluster,
                               STATISTICS *Statistics) {
  // Fraction of the number of samples used as a range around 1 within
  // which a cluster has the magic size that allows a boost to the
  // FTable by kFTableBoostMargin, thus allowing clusters near the
  // magic size (equal to the number of sample characters) to be more
  // likely to stay together.
  const double kMagicSampleMargin = 0.0625;
  const double kFTableBoostMargin = 2.0;

  int N = Clusterer->SampleSize;
  CLUSTER* Left = Cluster->Left;
  CLUSTER* Right = Cluster->Right;
  // a split can only be tested when both children exist
  if (Left == NULL || Right == NULL)
    return NULL;
  int TotalDims = Left->SampleCount + Right->SampleCount;
  // need more samples than dimensions for the test to be meaningful
  if (TotalDims < N + 1 || TotalDims < 2)
    return NULL;
  const int kMatrixSize = N * N * sizeof(FLOAT32);
  FLOAT32* Covariance = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
  FLOAT32* Inverse = reinterpret_cast<FLOAT32 *>(Emalloc(kMatrixSize));
  FLOAT32* Delta = reinterpret_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
  // Compute a new covariance matrix that only uses essential features.
  // Non-essential dimensions are replaced by identity rows/columns so
  // the matrix remains invertible.
  for (int i = 0; i < N; ++i) {
    int row_offset = i * N;
    if (!Clusterer->ParamDesc[i].NonEssential) {
      for (int j = 0; j < N; ++j) {
        if (!Clusterer->ParamDesc[j].NonEssential)
          Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
        else
          Covariance[j + row_offset] = 0.0f;
      }
    } else {
      for (int j = 0; j < N; ++j) {
        if (i == j)
          Covariance[j + row_offset] = 1.0f;
        else
          Covariance[j + row_offset] = 0.0f;
      }
    }
  }
  double err = InvertMatrix(Covariance, N, Inverse);
  if (err > 1) {
    tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
  }
  // difference of the child cluster means over essential dimensions only
  int EssentialN = 0;
  for (int dim = 0; dim < N; ++dim) {
    if (!Clusterer->ParamDesc[dim].NonEssential) {
      Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
      ++EssentialN;
    } else {
      Delta[dim] = 0.0f;
    }
  }
  // Compute Hotelling's T-squared: Delta' * Inverse * Delta.
  double Tsq = 0.0;
  for (int x = 0; x < N; ++x) {
    double temp = 0.0;
    for (int y = 0; y < N; ++y) {
      temp += Inverse[y + N*x] * Delta[y];
    }
    Tsq += Delta[x] * temp;
  }
  memfree(Covariance);
  memfree(Inverse);
  memfree(Delta);
  // Changed this function to match the formula in
  // Statistical Methods in Medical Research p 473
  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
  // clip the (1-based) F-table indices to the table size, then convert
  // to 0-based indices
  int Fx = EssentialN;
  if (Fx > FTABLE_X)
    Fx = FTABLE_X;
  --Fx;
  int Fy = TotalDims - EssentialN - 1;
  if (Fy > FTABLE_Y)
    Fy = FTABLE_Y;
  --Fy;
  double FTarget = FTable[Fy][Fx];
  if (Config->MagicSamples > 0 &&
      TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
      TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
    // Give magic-sized clusters a magic FTable boost.
    FTarget += kFTableBoostMargin;
  }
  // F below the critical value: the children are not significantly
  // different, so keep them together as a single elliptical proto
  if (F < FTarget) {
    return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
  }
  return NULL;
}
1202 
1203 /* MakeSphericalProto *******************************************************
1204 Parameters: Clusterer data struct containing samples being clustered
1205  Cluster cluster to be made into a spherical prototype
1206  Statistics statistical info about cluster
1207  Buckets histogram struct used to analyze distribution
1208 Operation: This routine tests the specified cluster to see if it can
1209  be approximated by a spherical normal distribution. If it
1210  can be, then a new prototype is formed and returned to the
1211  caller. If it can't be, then NULL is returned to the caller.
1212 Return: Pointer to new spherical prototype or NULL.
1213 Exceptions: None
1214 History: 6/1/89, DSJ, Created.
1215 ******************************************************************************/
                                CLUSTER *Cluster,
                                STATISTICS *Statistics,
                                BUCKETS *Buckets) {
  PROTOTYPE *Proto = NULL;
  int i;

  // check that each (essential) dimension is a normal distribution;
  // the spherical model uses the single average variance for all of them
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Cluster->Mean[i],
                 sqrt ((FLOAT64) (Statistics->AvgVariance)));
    if (!DistributionOK (Buckets))
      break;
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize)
    Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
  return (Proto);
}                                // MakeSphericalProto
1239 
1240 
                                 CLUSTER *Cluster,
                                 STATISTICS *Statistics,
                                 BUCKETS *Buckets) {
  PROTOTYPE *Proto = NULL;
  int i;

  // check that each essential dimension is a normal distribution,
  // using that dimension's own diagonal variance CoVariance[i*(N+1)]
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Cluster->Mean[i],
                 sqrt ((FLOAT64) Statistics->
                       CoVariance[i * (Clusterer->SampleSize + 1)]));
    if (!DistributionOK (Buckets))
      break;
  }
  // if all dimensions matched a normal distribution, make a proto
  if (i >= Clusterer->SampleSize)
    Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
  return (Proto);
}                                // MakeEllipticalProto
1278 
1279 
                           CLUSTER *Cluster,
                           STATISTICS *Statistics,
                           BUCKETS *NormalBuckets,
                           FLOAT64 Confidence) {
  PROTOTYPE *Proto;
  int i;
  BUCKETS *UniformBuckets = NULL;
  BUCKETS *RandomBuckets = NULL;

  // create a mixed proto to work on - initially assume all dimensions normal
  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);

  // find the proper distribution for each dimension: try normal first,
  // then random, then uniform; give up if none of the three fits
  for (i = 0; i < Clusterer->SampleSize; i++) {
    if (Clusterer->ParamDesc[i].NonEssential)
      continue;

    FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Proto->Mean[i],
                 sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
    if (DistributionOK (NormalBuckets))
      continue;

    // random/uniform bucket structures are created lazily, only once
    // some dimension actually fails the normal test
    if (RandomBuckets == NULL)
      RandomBuckets =
        GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
    MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
    FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Proto->Mean[i], Proto->Variance.Elliptical[i]);
    if (DistributionOK (RandomBuckets))
      continue;

    if (UniformBuckets == NULL)
      UniformBuckets =
        GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
    MakeDimUniform(i, Proto, Statistics);
    FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
                 Proto->Mean[i], Proto->Variance.Elliptical[i]);
    if (DistributionOK (UniformBuckets))
      continue;
    break;
  }
  // if any dimension failed to match a distribution, discard the proto
  if (i < Clusterer->SampleSize) {
    FreePrototype(Proto);
    Proto = NULL;
  }
  return (Proto);
}                                // MakeMixedProto
1348 
1349 
1350 /* MakeDimRandom *************************************************************
1351 Parameters: i index of dimension to be changed
1352  Proto prototype whose dimension is to be altered
1353  ParamDesc description of specified dimension
1354 Operation: This routine alters the ith dimension of the specified
1355  mixed prototype to be D_random.
1356 Return: None
1357 Exceptions: None
1358 History: 6/20/89, DSJ, Created.
1359 ******************************************************************************/
1360 void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) {
1361  Proto->Distrib[i] = D_random;
1362  Proto->Mean[i] = ParamDesc->MidRange;
1363  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1364 
1365  // subtract out the previous magnitude of this dimension from the total
1366  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1367  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1368  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1369  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1370 
1371  // note that the proto Weight is irrelevant for D_random protos
1372 } // MakeDimRandom
1373 
1374 
1385 void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) {
1386  Proto->Distrib[i] = uniform;
1387  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1388  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1389  Proto->Variance.Elliptical[i] =
1390  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1391  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1392  Proto->Variance.Elliptical[i] = MINVARIANCE;
1393 
1394  // subtract out the previous magnitude of this dimension from the total
1395  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1396  Proto->Magnitude.Elliptical[i] =
1397  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1398  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1399  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1400 
1401  // note that the proto Weight is irrelevant for uniform protos
1402 } // MakeDimUniform
1403 
1404 
STATISTICS *
ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) {
  STATISTICS *Statistics;
  int i, j;
  FLOAT32 *CoVariance;
  FLOAT32 *Distance;
  LIST SearchState;
  SAMPLE *Sample;
  uinT32 SampleCountAdjustedForBias;

  // allocate memory to hold the statistics results
  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // allocate temporary memory to hold the sample to mean distances
  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // initialize the statistics
  Statistics->AvgVariance = 1.0;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++) {
    Statistics->Min[i] = 0.0;
    Statistics->Max[i] = 0.0;
    for (j = 0; j < N; j++, CoVariance++)
      *CoVariance = 0;
  }
  // find each sample in the cluster and merge it into the statistics
  InitSampleSearch(SearchState, Cluster);
  while ((Sample = NextSample (&SearchState)) != NULL) {
    // compute this sample's offset from the cluster mean, wrapping
    // circular dimensions into [-HalfRange, HalfRange]
    for (i = 0; i < N; i++) {
      Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
      if (ParamDesc[i].Circular) {
        if (Distance[i] > ParamDesc[i].HalfRange)
          Distance[i] -= ParamDesc[i].Range;
        if (Distance[i] < -ParamDesc[i].HalfRange)
          Distance[i] += ParamDesc[i].Range;
      }
      // track the extreme offsets seen in each dimension
      if (Distance[i] < Statistics->Min[i])
        Statistics->Min[i] = Distance[i];
      if (Distance[i] > Statistics->Max[i])
        Statistics->Max[i] = Distance[i];
    }
    // accumulate the outer product of the offset vector into CoVariance
    CoVariance = Statistics->CoVariance;
    for (i = 0; i < N; i++)
      for (j = 0; j < N; j++, CoVariance++)
        *CoVariance += Distance[i] * Distance[j];
  }
  // normalize the variances by the total number of samples
  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
  // also compute the geometric mean of the diagonal variances
  // ensure that clusters with only 1 sample are handled correctly
  if (Cluster->SampleCount > 1)
    SampleCountAdjustedForBias = Cluster->SampleCount - 1;
  else
    SampleCountAdjustedForBias = 1;
  CoVariance = Statistics->CoVariance;
  for (i = 0; i < N; i++)
    for (j = 0; j < N; j++, CoVariance++) {
      *CoVariance /= SampleCountAdjustedForBias;
      if (j == i) {
        // clip tiny diagonal variances, then fold them into the running
        // product used for the geometric mean below
        if (*CoVariance < MINVARIANCE)
          *CoVariance = MINVARIANCE;
        Statistics->AvgVariance *= *CoVariance;
      }
    }
  // AvgVariance now holds the product of the N diagonal variances;
  // the Nth root gives their geometric mean
  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
                                       1.0 / N);

  // release temporary memory and return
  memfree(Distance);
  return (Statistics);
}                                // ComputeStatistics
1495 
1496 
                              CLUSTER *Cluster,
                              STATISTICS *Statistics) {
  PROTOTYPE *Proto;

  Proto = NewSimpleProto (N, Cluster);

  // a single variance is shared by every dimension, clipped to MINVARIANCE
  Proto->Variance.Spherical = Statistics->AvgVariance;
  if (Proto->Variance.Spherical < MINVARIANCE)
    Proto->Variance.Spherical = MINVARIANCE;

  // per-dimension normal density peak; the total magnitude is that value
  // raised to the number of dimensions since all dimensions are identical
  Proto->Magnitude.Spherical =
    1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
                                     (double) N);
  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);

  return (Proto);
}                                // NewSphericalProto
1530 
1531 
                               CLUSTER *Cluster,
                               STATISTICS *Statistics) {
  PROTOTYPE *Proto;
  FLOAT32 *CoVariance;
  int i;

  Proto = NewSimpleProto (N, Cluster);
  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  // walk the diagonal of the covariance matrix (stride N + 1) to pick up
  // the per-dimension variances
  CoVariance = Statistics->CoVariance;
  Proto->TotalMagnitude = 1.0;
  for (i = 0; i < N; i++, CoVariance += N + 1) {
    Proto->Variance.Elliptical[i] = *CoVariance;
    if (Proto->Variance.Elliptical[i] < MINVARIANCE)
      Proto->Variance.Elliptical[i] = MINVARIANCE;

    // per-dimension normal density peak and weight; total magnitude is
    // the product over all dimensions
    Proto->Magnitude.Elliptical[i] =
      1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
    Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
    Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
  }
  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
  Proto->Style = elliptical;
  return (Proto);
}                                // NewEllipticalProto
1572 
1573 
1589 PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) {
1590  PROTOTYPE *Proto;
1591  int i;
1592 
1593  Proto = NewEllipticalProto (N, Cluster, Statistics);
1594  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1595 
1596  for (i = 0; i < N; i++) {
1597  Proto->Distrib[i] = normal;
1598  }
1599  Proto->Style = mixed;
1600  return (Proto);
1601 } // NewMixedProto
1602 
1603 
  PROTOTYPE *Proto;
  int i;

  // allocate the proto and copy the cluster mean into it
  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));

  for (i = 0; i < N; i++)
    Proto->Mean[i] = Cluster->Mean[i];
  Proto->Distrib = NULL;

  Proto->Significant = TRUE;
  Proto->Merged = FALSE;
  // default to spherical; callers overwrite Style as needed
  Proto->Style = spherical;
  Proto->NumSamples = Cluster->SampleCount;
  // link proto and cluster, marking the cluster as represented
  Proto->Cluster = Cluster;
  Proto->Cluster->Prototype = TRUE;
  return (Proto);
}                                // NewSimpleProto
1633 
1634 
BOOL8
inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) {
  int i, j;
  FLOAT32 *VARii;                // points to ith on-diagonal element
  FLOAT32 *VARjj;                // points to jth on-diagonal element
  FLOAT32 CorrelationCoeff;

  // examine each unique pair of essential dimensions
  VARii = CoVariance;
  for (i = 0; i < N; i++, VARii += N + 1) {
    if (ParamDesc[i].NonEssential)
      continue;

    VARjj = VARii + N + 1;
    CoVariance = VARii + 1;
    for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
      if (ParamDesc[j].NonEssential)
        continue;

      // sqrt(sqrt(cov^2 / (var_i * var_j))) is the square root of the
      // absolute correlation coefficient between dimensions i and j
      if ((*VARii == 0.0) || (*VARjj == 0.0))
        CorrelationCoeff = 0.0;
      else
        CorrelationCoeff =
          sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
      // any pair more correlated than Independence fails the whole test
      if (CorrelationCoeff > Independence)
        return (FALSE);
    }
  }
  return (TRUE);
}                                // Independent
1685 
1686 
                    DISTRIBUTION Distribution,
                    uinT32 SampleCount,
                    FLOAT64 Confidence) {
  // Get an old bucket structure with the same number of buckets.
  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  BUCKETS *Buckets =
      clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];

  // If a matching bucket structure is not found, make one and save it.
  if (Buckets == NULL) {
    Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
    clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
        Buckets;
  } else {
    // Just adjust the existing buckets.
    if (SampleCount != Buckets->SampleCount)
      AdjustBuckets(Buckets, SampleCount);
    // recompute the chi-squared threshold only when the confidence changed
    if (Confidence != Buckets->Confidence) {
      Buckets->Confidence = Confidence;
      Buckets->ChiSquared = ComputeChiSquared(
          DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
          Confidence);
    }
    InitBuckets(Buckets);
  }
  return Buckets;
}  // GetBuckets
1733 
1734 
                     uinT32 SampleCount,
                     FLOAT64 Confidence) {
  // density functions indexed by DISTRIBUTION (normal, uniform, D_random)
  const DENSITYFUNC DensityFunction[] =
    { NormalDensity, UniformDensity, UniformDensity };
  int i, j;
  BUCKETS *Buckets;
  FLOAT64 BucketProbability;
  FLOAT64 NextBucketBoundary;
  FLOAT64 Probability;
  FLOAT64 ProbabilityDelta;
  FLOAT64 LastProbDensity;
  FLOAT64 ProbDensity;
  uinT16 CurrentBucket;
  BOOL8 Symmetrical;

  // allocate memory needed for data structure
  Buckets = reinterpret_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
  Buckets->SampleCount = SampleCount;
  Buckets->Confidence = Confidence;
  Buckets->Count = reinterpret_cast<uinT32*>(
      Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
  Buckets->ExpectedCount = reinterpret_cast<FLOAT32*>(
      Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));

  // initialize simple fields
  Buckets->Distribution = Distribution;
  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
    Buckets->Count[i] = 0;
    Buckets->ExpectedCount[i] = 0.0;
  }

  // all currently defined distributions are symmetrical
  Symmetrical = TRUE;
  Buckets->ChiSquared = ComputeChiSquared(
      DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);

  if (Symmetrical) {
    // allocate buckets so that all have approx. equal probability
    BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);

    // distribution is symmetric so fill in upper half then copy
    CurrentBucket = Buckets->NumberOfBuckets / 2;
    // an odd bucket count means the middle bucket straddles the mean,
    // so only half its probability lies in the upper half of the table
    if (Odd (Buckets->NumberOfBuckets))
      NextBucketBoundary = BucketProbability / 2;
    else
      NextBucketBoundary = BucketProbability;

    // integrate the density over the upper half of the table, advancing
    // to the next bucket whenever the accumulated probability crosses a
    // bucket boundary
    Probability = 0.0;
    LastProbDensity =
      (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
    for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
      ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
      ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
      Probability += ProbabilityDelta;
      if (Probability > NextBucketBoundary) {
        if (CurrentBucket < Buckets->NumberOfBuckets - 1)
          CurrentBucket++;
        NextBucketBoundary += BucketProbability;
      }
      Buckets->Bucket[i] = CurrentBucket;
      Buckets->ExpectedCount[CurrentBucket] +=
        (FLOAT32) (ProbabilityDelta * SampleCount);
      LastProbDensity = ProbDensity;
    }
    // place any leftover probability into the last bucket
    Buckets->ExpectedCount[CurrentBucket] +=
      (FLOAT32) ((0.5 - Probability) * SampleCount);

    // copy upper half of distribution to lower half
    for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
      Buckets->Bucket[i] =
        Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);

    // copy upper half of expected counts to lower half
    for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
      Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
  }
  return Buckets;
}  // MakeBuckets
1836 
1837 
1838 //---------------------------------------------------------------------------
1840 /*
1841  ** Parameters:
1842  ** SampleCount number of samples to be tested
1843  ** Operation:
1844  ** This routine computes the optimum number of histogram
1845  ** buckets that should be used in a chi-squared goodness of
1846  ** fit test for the specified number of samples. The optimum
1847  ** number is computed based on Table 4.1 on pg. 147 of
1848  ** "Measurement and Analysis of Random Data" by Bendat & Piersol.
1849  ** Linear interpolation is used to interpolate between table
1850  ** values. The table is intended for a 0.05 level of
1851  ** significance (alpha). This routine assumes that it is
1852  ** equally valid for other alpha's, which may not be true.
1853  ** Return:
1854  ** Optimum number of histogram buckets
1855  ** Exceptions:
1856  ** None
1857  ** History:
1858  ** 6/5/89, DSJ, Created.
1859  */
  uinT8 Last, Next;
  FLOAT32 Slope;

  // clip to the smallest table entry
  if (SampleCount < kCountTable[0])
    return kBucketsTable[0];

  // linearly interpolate between adjacent table entries
  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
    if (SampleCount <= kCountTable[Next]) {
      Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
          (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
      return ((uinT16) (kBucketsTable[Last] +
                        Slope * (SampleCount - kCountTable[Last])));
    }
  }
  // beyond the table: use the largest entry
  return kBucketsTable[Last];
}                                // OptimumNumberOfBuckets
1876 
1877 
1878 //---------------------------------------------------------------------------
FLOAT64
/*
 ** Parameters:
 ** DegreesOfFreedom determines shape of distribution
 ** Alpha probability of right tail
 ** Operation:
 ** This routine computes the chi-squared value which will
 ** leave a cumulative probability of Alpha in the right tail
 ** of a chi-squared distribution with the specified number of
 ** degrees of freedom. Alpha must be between 0 and 1.
 ** DegreesOfFreedom must be even. The routine maintains an
 ** array of lists. Each list corresponds to a different
 ** number of degrees of freedom. Each entry in the list
 ** corresponds to a different alpha value and its corresponding
 ** chi-squared value. Therefore, once a particular chi-squared
 ** value is computed, it is stored in the list and never
 ** needs to be computed again.
 ** Return: Desired chi-squared value
 ** Exceptions: none
 ** History: 6/5/89, DSJ, Created.
 */
#define CHIACCURACY 0.01
#define MINALPHA (1e-200)
{
  // memoization cache: one list of (alpha, chi-squared) pairs per
  // (even) number of degrees of freedom
  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];

  CHISTRUCT *OldChiSquared;
  CHISTRUCT SearchKey;

  // limit the minimum alpha that can be used - if alpha is too small
  // it may not be possible to compute chi-squared.
  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
  // round odd degrees of freedom up to even, as required by the solver
  if (Odd (DegreesOfFreedom))
    DegreesOfFreedom++;

  /* find the list of chi-squared values which have already been computed
     for the specified number of degrees of freedom. Search the list for
     the desired chi-squared. */
  SearchKey.Alpha = Alpha;
  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
                                                    &SearchKey, AlphaMatch));

  // cache miss: solve for the chi-squared value numerically, then memoize
  if (OldChiSquared == NULL) {
    OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
    OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
                                       (FLOAT64) DegreesOfFreedom,
                                       (FLOAT64) CHIACCURACY);
    ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
                                      OldChiSquared);
  }
  else {
    // further optimization might move OldChiSquared to front of list
  }

  return (OldChiSquared->ChiSquared);

}                                // ComputeChiSquared
1937 
1938 
1939 //---------------------------------------------------------------------------
1941 /*
1942  ** Parameters:
1943  ** x number to compute the normal probability density for
1944  ** Globals:
1945  ** kNormalMean mean of a discrete normal distribution
1946  ** kNormalVariance variance of a discrete normal distribution
1947  ** kNormalMagnitude magnitude of a discrete normal distribution
1948  ** Operation:
1949  ** This routine computes the probability density function
1950  ** of a discrete normal distribution defined by the global
1951  ** variables kNormalMean, kNormalVariance, and kNormalMagnitude.
1952  ** Normal magnitude could, of course, be computed in terms of
1953  ** the normal variance but it is precomputed for efficiency.
1954  ** Return:
1955  ** The value of the normal distribution at x.
1956  ** Exceptions:
1957  ** None
1958  ** History:
1959  ** 6/4/89, DSJ, Created.
1960  */
  FLOAT64 Distance;

  Distance = x - kNormalMean;
  // Gaussian density; kNormalMagnitude is the precomputed peak value
  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
}                                // NormalDensity
1966 
1967 
1968 //---------------------------------------------------------------------------
1970 /*
1971  ** Parameters:
1972  ** x number to compute the uniform probability density for
1973  ** Operation:
1974  ** This routine computes the probability density function
1975  ** of a uniform distribution at the specified point. The
1976  ** range of the distribution is from 0 to BUCKETTABLESIZE.
1977  ** Return:
1978  ** The value of the uniform distribution at x.
1979  ** Exceptions:
1980  ** None
1981  ** History:
1982  ** 6/5/89, DSJ, Created.
1983  */
  // constant density over [0, BUCKETTABLESIZE], zero everywhere else
  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;

  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
    return UniformDistributionDensity;
  else
    return (FLOAT64) 0.0;
}                                // UniformDensity
1991 
1992 
1993 //---------------------------------------------------------------------------
1995 /*
1996  ** Parameters:
1997  ** f1 value of function at x1
1998  ** f2 value of function at x2
1999  ** Dx x2 - x1 (should always be positive)
2000  ** Operation:
2001  ** This routine computes a trapezoidal approximation to the
2002  ** integral of a function over a small delta in x.
2003  ** Return:
2004  ** Approximation of the integral of the function from x1 to x2.
2005  ** Exceptions:
2006  ** None
2007  ** History:
2008  ** 6/5/89, DSJ, Created.
2009  */
  // trapezoidal rule over a single interval of width Dx
  return (f1 + f2) * Dx / 2.0;
}                                // Integral
2012 
2013 
2014 //---------------------------------------------------------------------------
2015 void FillBuckets(BUCKETS *Buckets,
2016  CLUSTER *Cluster,
2017  uinT16 Dim,
2018  PARAM_DESC *ParamDesc,
2019  FLOAT32 Mean,
2020  FLOAT32 StdDev) {
2021 /*
2022  ** Parameters:
2023  ** Buckets histogram buckets to count samples
2024  ** Cluster cluster whose samples are being analyzed
2025  ** Dim dimension of samples which is being analyzed
2026  ** ParamDesc description of the dimension
2027  ** Mean "mean" of the distribution
2028  ** StdDev "standard deviation" of the distribution
2029  ** Operation:
2030  ** This routine counts the number of cluster samples which
2031  ** fall within the various histogram buckets in Buckets. Only
2032  ** one dimension of each sample is examined. The exact meaning
2033  ** of the Mean and StdDev parameters depends on the
2034  ** distribution which is being analyzed (this info is in the
2035  ** Buckets data structure). For normal distributions, Mean
2036  ** and StdDev have the expected meanings. For uniform and
2037  ** random distributions the Mean is the center point of the
2038  ** range and the StdDev is 1/2 the range. A dimension with
2039  ** zero standard deviation cannot be statistically analyzed.
2040  ** In this case, a pseudo-analysis is used.
2041  ** Return:
2042  ** None (the Buckets data structure is filled in)
2043  ** Exceptions:
2044  ** None
2045  ** History:
2046  ** 6/5/89, DSJ, Created.
2047  */
2048  uinT16 BucketID;
2049  int i;
2050  LIST SearchState;
2051  SAMPLE *Sample;
2052 
2053  // initialize the histogram bucket counts to 0
2054  for (i = 0; i < Buckets->NumberOfBuckets; i++)
2055  Buckets->Count[i] = 0;
2056 
2057  if (StdDev == 0.0) {
2058  /* if the standard deviation is zero, then we can't statistically
2059  analyze the cluster. Use a pseudo-analysis: samples exactly on
2060  the mean are distributed evenly across all buckets. Samples greater
2061  than the mean are placed in the last bucket; samples less than the
2062  mean are placed in the first bucket. */
2063 
2064  InitSampleSearch(SearchState, Cluster);
2065  i = 0;
2066  while ((Sample = NextSample (&SearchState)) != NULL) {
2067  if (Sample->Mean[Dim] > Mean)
2068  BucketID = Buckets->NumberOfBuckets - 1;
2069  else if (Sample->Mean[Dim] < Mean)
2070  BucketID = 0;
2071  else
2072  BucketID = i;
2073  Buckets->Count[BucketID] += 1;
2074  i++;
2075  if (i >= Buckets->NumberOfBuckets)
2076  i = 0;
2077  }
2078  }
2079  else {
2080  // search for all samples in the cluster and add to histogram buckets
2081  InitSampleSearch(SearchState, Cluster);
2082  while ((Sample = NextSample (&SearchState)) != NULL) {
2083  switch (Buckets->Distribution) {
2084  case normal:
2085  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2086  Mean, StdDev);
2087  break;
2088  case D_random:
2089  case uniform:
2090  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2091  Mean, StdDev);
2092  break;
2093  default:
2094  BucketID = 0;
2095  }
2096  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2097  }
2098  }
2099 } // FillBuckets
2100 
2101 
 //---------------------------------------------------------------------------
                     FLOAT32 x,
                     FLOAT32 Mean,
                     FLOAT32 StdDev) {
/*
 ** Parameters:
 ** ParamDesc used to identify circular dimensions
 ** x value to be normalized
 ** Mean mean of normal distribution
 ** StdDev standard deviation of normal distribution
 ** Operation:
 ** This routine determines which bucket x falls into in the
 ** discrete normal distribution defined by kNormalMean
 ** and kNormalStdDev. x values which exceed the range of
 ** the discrete distribution are clipped.
 ** Return:
 ** Bucket number into which x falls
 ** Exceptions:
 ** None
 ** History:
 ** 6/5/89, DSJ, Created.
 */
  FLOAT32 X;

  // wraparound circular parameters if necessary
  if (ParamDesc->Circular) {
    if (x - Mean > ParamDesc->HalfRange)
      x -= ParamDesc->Range;
    else if (x - Mean < -ParamDesc->HalfRange)
      x += ParamDesc->Range;
  }

  // standardize x, then rescale into units of the discrete distribution
  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
  // clip out-of-range values to the first/last table entry
  if (X < 0)
    return 0;
  if (X > BUCKETTABLESIZE - 1)
    return ((uinT16) (BUCKETTABLESIZE - 1));
  return (uinT16) floor((FLOAT64) X);
}                                // NormalBucket
2142 
2143 
2144 //---------------------------------------------------------------------------
2146  FLOAT32 x,
2147  FLOAT32 Mean,
2148  FLOAT32 StdDev) {
2149 /*
2150  ** Parameters:
2151  ** ParamDesc used to identify circular dimensions
2152  ** x value to be normalized
2153  ** Mean center of range of uniform distribution
2154  ** StdDev 1/2 the range of the uniform distribution
2155  ** Operation:
2156  ** This routine determines which bucket x falls into in the
2157  ** discrete uniform distribution defined by
2158  ** BUCKETTABLESIZE. x values which exceed the range of
2159  ** the discrete distribution are clipped.
2160  ** Return:
2161  ** Bucket number into which x falls
2162  ** Exceptions:
2163  ** None
2164  ** History:
2165  ** 6/5/89, DSJ, Created.
2166  */
2167  FLOAT32 X;
2168 
2169  // wraparound circular parameters if necessary
2170  if (ParamDesc->Circular) {
2171  if (x - Mean > ParamDesc->HalfRange)
2172  x -= ParamDesc->Range;
2173  else if (x - Mean < -ParamDesc->HalfRange)
2174  x += ParamDesc->Range;
2175  }
2176 
2177  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2178  if (X < 0)
2179  return 0;
2180  if (X > BUCKETTABLESIZE - 1)
2181  return (uinT16) (BUCKETTABLESIZE - 1);
2182  return (uinT16) floor((FLOAT64) X);
2183 } // UniformBucket
2184 
2185 
2186 //---------------------------------------------------------------------------
2188 /*
2189  ** Parameters:
2190  ** Buckets histogram data to perform chi-square test on
2191  ** Operation:
2192  ** This routine performs a chi-square goodness of fit test
2193  ** on the histogram data in the Buckets data structure. TRUE
2194  ** is returned if the histogram matches the probability
2195  ** distribution which was specified when the Buckets
2196  ** structure was originally created. Otherwise FALSE is
2197  ** returned.
2198  ** Return:
2199  ** TRUE if samples match distribution, FALSE otherwise
2200  ** Exceptions:
2201  ** None
2202  ** History:
2203  ** 6/5/89, DSJ, Created.
2204  */
2205  FLOAT32 FrequencyDifference;
2206  FLOAT32 TotalDifference;
2207  int i;
2208 
2209  // compute how well the histogram matches the expected histogram
2210  TotalDifference = 0.0;
2211  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2212  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2213  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2214  Buckets->ExpectedCount[i];
2215  }
2216 
2217  // test to see if the difference is more than expected
2218  if (TotalDifference > Buckets->ChiSquared)
2219  return FALSE;
2220  else
2221  return TRUE;
2222 } // DistributionOK
2223 
2224 
2225 //---------------------------------------------------------------------------
2226 void FreeStatistics(STATISTICS *Statistics) {
2227 /*
2228  ** Parameters:
2229  ** Statistics pointer to data structure to be freed
2230  ** Operation:
2231  ** This routine frees the memory used by the statistics
2232  ** data structure.
2233  ** Return:
2234  ** None
2235  ** Exceptions:
2236  ** None
2237  ** History:
2238  ** 6/5/89, DSJ, Created.
2239  */
2240  memfree (Statistics->CoVariance);
2241  memfree (Statistics->Min);
2242  memfree (Statistics->Max);
2243  memfree(Statistics);
2244 } // FreeStatistics
2245 
2246 
2247 //---------------------------------------------------------------------------
2248 void FreeBuckets(BUCKETS *buckets) {
2249 /*
2250  ** Parameters:
2251  ** buckets pointer to data structure to be freed
2252  ** Operation:
2253  ** This routine properly frees the memory used by a BUCKETS.
2254  */
2255  Efree(buckets->Count);
2256  Efree(buckets->ExpectedCount);
2257  Efree(buckets);
2258 } // FreeBuckets
2259 
2260 
2261 //---------------------------------------------------------------------------
2262 void FreeCluster(CLUSTER *Cluster) {
2263 /*
2264  ** Parameters:
2265  ** Cluster pointer to cluster to be freed
2266  ** Operation:
2267  ** This routine frees the memory consumed by the specified
2268  ** cluster and all of its subclusters. This is done by
2269  ** recursive calls to FreeCluster().
2270  ** Return:
2271  ** None
2272  ** Exceptions:
2273  ** None
2274  ** History:
2275  ** 6/6/89, DSJ, Created.
2276  */
2277  if (Cluster != NULL) {
2278  FreeCluster (Cluster->Left);
2279  FreeCluster (Cluster->Right);
2280  memfree(Cluster);
2281  }
2282 } // FreeCluster
2283 
2284 
2285 //---------------------------------------------------------------------------
2286 uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) {
2287 /*
2288  ** Parameters:
2289  ** Distribution distribution being tested for
2290  ** HistogramBuckets number of buckets in chi-square test
2291  ** Operation:
2292  ** This routine computes the degrees of freedom that should
2293  ** be used in a chi-squared test with the specified number of
2294  ** histogram buckets. The result is always rounded up to
2295  ** the next even number so that the value of chi-squared can be
2296  ** computed more easily. This will cause the value of
2297  ** chi-squared to be higher than the optimum value, resulting
2298  ** in the chi-square test being more lenient than optimum.
2299  ** Return: The number of degrees of freedom for a chi-square test
2300  ** Exceptions: none
2301  ** History: Thu Aug 3 14:04:18 1989, DSJ, Created.
2302  */
2303  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2304 
2305  uinT16 AdjustedNumBuckets;
2306 
2307  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2308  if (Odd (AdjustedNumBuckets))
2309  AdjustedNumBuckets++;
2310  return (AdjustedNumBuckets);
2311 
2312 } // DegreesOfFreedom
2313 
2314 
2315 //---------------------------------------------------------------------------
2316 int NumBucketsMatch(void *arg1, // BUCKETS *Histogram,
2317  void *arg2) { // uinT16 *DesiredNumberOfBuckets)
2318 /*
2319  ** Parameters:
2320  ** Histogram current histogram being tested for a match
2321  ** DesiredNumberOfBuckets match key
2322  ** Operation:
2323  ** This routine is used to search a list of histogram data
2324  ** structures to find one with the specified number of
2325  ** buckets. It is called by the list search routines.
2326  ** Return: TRUE if Histogram matches DesiredNumberOfBuckets
2327  ** Exceptions: none
2328  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2329  */
2330  BUCKETS *Histogram = (BUCKETS *) arg1;
2331  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2332 
2333  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2334 
2335 } // NumBucketsMatch
2336 
2337 
//---------------------------------------------------------------------------
int ListEntryMatch(void *arg1,    // ListNode
                   void *arg2) {  // Key
/*
 ** Parameters: none
 ** Operation:
 **  Predicate used by the list delete_d routine: matches a list node
 **  whose contents are pointer-identical to Key.
 ** Return: TRUE if ListNode matches Key
 */
  return arg1 == arg2;
}  // ListEntryMatch
2354 
2355 
2356 //---------------------------------------------------------------------------
2357 void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) {
2358 /*
2359  ** Parameters:
2360  ** Buckets histogram data structure to adjust
2361  ** NewSampleCount new sample count to adjust to
2362  ** Operation:
2363  ** This routine multiplies each ExpectedCount histogram entry
2364  ** by NewSampleCount/OldSampleCount so that the histogram
2365  ** is now adjusted to the new sample count.
2366  ** Return: none
2367  ** Exceptions: none
2368  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2369  */
2370  int i;
2371  FLOAT64 AdjustFactor;
2372 
2373  AdjustFactor = (((FLOAT64) NewSampleCount) /
2374  ((FLOAT64) Buckets->SampleCount));
2375 
2376  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2377  Buckets->ExpectedCount[i] *= AdjustFactor;
2378  }
2379 
2380  Buckets->SampleCount = NewSampleCount;
2381 
2382 } // AdjustBuckets
2383 
2384 
2385 //---------------------------------------------------------------------------
2386 void InitBuckets(BUCKETS *Buckets) {
2387 /*
2388  ** Parameters:
2389  ** Buckets histogram data structure to init
2390  ** Operation:
2391  ** This routine sets the bucket counts in the specified histogram
2392  ** to zero.
2393  ** Return: none
2394  ** Exceptions: none
2395  ** History: Thu Aug 3 14:31:14 1989, DSJ, Created.
2396  */
2397  int i;
2398 
2399  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2400  Buckets->Count[i] = 0;
2401  }
2402 
2403 } // InitBuckets
2404 
2405 
2406 //---------------------------------------------------------------------------
2407 int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct,
2408  void *arg2) { //CHISTRUCT *SearchKey)
2409 /*
2410  ** Parameters:
2411  ** ChiStruct chi-squared struct being tested for a match
2412  ** SearchKey chi-squared struct that is the search key
2413  ** Operation:
2414  ** This routine is used to search a list of structures which
2415  ** hold pre-computed chi-squared values for a chi-squared
2416  ** value whose corresponding alpha field matches the alpha
2417  ** field of SearchKey.
2418  ** It is called by the list search routines.
2419  ** Return: TRUE if ChiStruct's Alpha matches SearchKey's Alpha
2420  ** Exceptions: none
2421  ** History: Thu Aug 3 14:17:33 1989, DSJ, Created.
2422  */
2423  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2424  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2425 
2426  return (ChiStruct->Alpha == SearchKey->Alpha);
2427 
2428 } // AlphaMatch
2429 
2430 
2431 //---------------------------------------------------------------------------
2432 CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) {
2433 /*
2434  ** Parameters:
2435  ** DegreesOfFreedom degrees of freedom for new chi value
2436  ** Alpha confidence level for new chi value
2437  ** Operation:
2438  ** This routine allocates a new data structure which is used
2439  ** to hold a chi-squared value along with its associated
2440  ** number of degrees of freedom and alpha value.
2441  ** Return: none
2442  ** Exceptions: none
2443  ** History: Fri Aug 4 11:04:59 1989, DSJ, Created.
2444  */
2446 
2447  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2448  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2449  NewChiStruct->Alpha = Alpha;
2450  return (NewChiStruct);
2451 
2452 } // NewChiStruct
2453 
2454 
2455 //---------------------------------------------------------------------------
2456 FLOAT64
2457 Solve (SOLVEFUNC Function,
2458 void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
2459 /*
2460  ** Parameters:
2461  ** Function function whose zero is to be found
2462  ** FunctionParams arbitrary data to pass to function
2463  ** InitialGuess point to start solution search at
2464  ** Accuracy maximum allowed error
2465  ** Operation:
2466  ** This routine attempts to find an x value at which Function
2467  ** goes to zero (i.e. a root of the function ). It will only
2468  ** work correctly if a solution actually exists and there
2469  ** are no extrema between the solution and the InitialGuess.
2470  ** The algorithms used are extremely primitive.
2471  ** Return: Solution of function ( x for which f(x) = 0 ).
2472  ** Exceptions: none
2473  ** History: Fri Aug 4 11:08:59 1989, DSJ, Created.
2474  */
2475 #define INITIALDELTA 0.1
2476 #define DELTARATIO 0.1
2477 {
2478  FLOAT64 x;
2479  FLOAT64 f;
2480  FLOAT64 Slope;
2481  FLOAT64 Delta;
2482  FLOAT64 NewDelta;
2483  FLOAT64 xDelta;
2484  FLOAT64 LastPosX, LastNegX;
2485 
2486  x = InitialGuess;
2487  Delta = INITIALDELTA;
2488  LastPosX = MAX_FLOAT32;
2489  LastNegX = -MAX_FLOAT32;
2490  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2491  while (Abs (LastPosX - LastNegX) > Accuracy) {
2492  // keep track of outer bounds of current estimate
2493  if (f < 0)
2494  LastNegX = x;
2495  else
2496  LastPosX = x;
2497 
2498  // compute the approx. slope of f(x) at the current point
2499  Slope =
2500  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2501 
2502  // compute the next solution guess */
2503  xDelta = f / Slope;
2504  x -= xDelta;
2505 
2506  // reduce the delta used for computing slope to be a fraction of
2507  //the amount moved to get to the new guess
2508  NewDelta = Abs (xDelta) * DELTARATIO;
2509  if (NewDelta < Delta)
2510  Delta = NewDelta;
2511 
2512  // compute the value of the function at the new guess
2513  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2514  }
2515  return (x);
2516 
2517 } // Solve
2518 
2519 
2520 //---------------------------------------------------------------------------
2522 /*
2523  ** Parameters:
2524  ** ChiParams contains degrees of freedom and alpha
2525  ** x value of chi-squared to evaluate
2526  ** Operation:
2527  ** This routine computes the area under a chi density curve
2528  ** from 0 to x, minus the desired area under the curve. The
2529  ** number of degrees of freedom of the chi curve is specified
2530  ** in the ChiParams structure. The desired area is also
2531  ** specified in the ChiParams structure as Alpha ( or 1 minus
2532  ** the desired area ). This routine is intended to be passed
2533  ** to the Solve() function to find the value of chi-squared
2534  ** which will yield a desired area under the right tail of
2535  ** the chi density curve. The function will only work for
2536  ** even degrees of freedom. The equations are based on
2537  ** integrating the chi density curve in parts to obtain
2538  ** a series that can be used to compute the area under the
2539  ** curve.
2540  ** Return: Error between actual and desired area under the chi curve.
2541  ** Exceptions: none
2542  ** History: Fri Aug 4 12:48:41 1989, DSJ, Created.
2543  */
2544  int i, N;
2545  FLOAT64 SeriesTotal;
2546  FLOAT64 Denominator;
2547  FLOAT64 PowerOfx;
2548 
2549  N = ChiParams->DegreesOfFreedom / 2 - 1;
2550  SeriesTotal = 1;
2551  Denominator = 1;
2552  PowerOfx = 1;
2553  for (i = 1; i <= N; i++) {
2554  Denominator *= 2 * i;
2555  PowerOfx *= x;
2556  SeriesTotal += PowerOfx / Denominator;
2557  }
2558  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2559 
2560 } // ChiArea
2561 
2562 
2563 //---------------------------------------------------------------------------
2564 BOOL8
2566 CLUSTER * Cluster, FLOAT32 MaxIllegal)
2567 /*
2568  ** Parameters:
2569  ** Clusterer data structure holding cluster tree
2570  ** Cluster cluster containing samples to be tested
2571  ** MaxIllegal max percentage of samples allowed to have
2572  ** more than 1 feature in the cluster
2573  ** Operation:
2574  ** This routine looks at all samples in the specified cluster.
2575  ** It computes a running estimate of the percentage of the
2576  ** charaters which have more than 1 sample in the cluster.
2577  ** When this percentage exceeds MaxIllegal, TRUE is returned.
2578  ** Otherwise FALSE is returned. The CharID
2579  ** fields must contain integers which identify the training
2580  ** characters which were used to generate the sample. One
2581  ** integer is used for each sample. The NumChar field in
2582  ** the Clusterer must contain the number of characters in the
2583  ** training set. All CharID fields must be between 0 and
2584  ** NumChar-1. The main function of this routine is to help
2585  ** identify clusters which need to be split further, i.e. if
2586  ** numerous training characters have 2 or more features which are
2587  ** contained in the same cluster, then the cluster should be
2588  ** split.
2589  ** Return: TRUE if the cluster should be split, FALSE otherwise.
2590  ** Exceptions: none
2591  ** History: Wed Aug 30 11:13:05 1989, DSJ, Created.
2592  ** 2/22/90, DSJ, Added MaxIllegal control rather than always
2593  ** splitting illegal clusters.
2594  */
2595 #define ILLEGAL_CHAR 2
2596 {
2597  static BOOL8 *CharFlags = NULL;
2598  static inT32 NumFlags = 0;
2599  int i;
2600  LIST SearchState;
2601  SAMPLE *Sample;
2602  inT32 CharID;
2603  inT32 NumCharInCluster;
2604  inT32 NumIllegalInCluster;
2605  FLOAT32 PercentIllegal;
2606 
2607  // initial estimate assumes that no illegal chars exist in the cluster
2608  NumCharInCluster = Cluster->SampleCount;
2609  NumIllegalInCluster = 0;
2610 
2611  if (Clusterer->NumChar > NumFlags) {
2612  if (CharFlags != NULL)
2613  memfree(CharFlags);
2614  NumFlags = Clusterer->NumChar;
2615  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2616  }
2617 
2618  for (i = 0; i < NumFlags; i++)
2619  CharFlags[i] = FALSE;
2620 
2621  // find each sample in the cluster and check if we have seen it before
2622  InitSampleSearch(SearchState, Cluster);
2623  while ((Sample = NextSample (&SearchState)) != NULL) {
2624  CharID = Sample->CharID;
2625  if (CharFlags[CharID] == FALSE) {
2626  CharFlags[CharID] = TRUE;
2627  }
2628  else {
2629  if (CharFlags[CharID] == TRUE) {
2630  NumIllegalInCluster++;
2631  CharFlags[CharID] = ILLEGAL_CHAR;
2632  }
2633  NumCharInCluster--;
2634  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2635  if (PercentIllegal > MaxIllegal) {
2636  destroy(SearchState);
2637  return (TRUE);
2638  }
2639  }
2640  }
2641  return (FALSE);
2642 
2643 } // MultipleCharSamples
2644 
// Compute the inverse of a matrix using LU decomposition with partial pivoting.
// The return value is the sum of norms of the off-diagonal terms of the
// product of a and inv. (A measure of the error.)
// input and inv are both size*size row-major float arrays; inv is written.
// NOTE(review): there is no guard against a zero pivot (singular or empty
// input); the divisions below would then produce inf/nan in the result and
// in the returned error — confirm callers check the error value.
double InvertMatrix(const float* input, int size, float* inv) {
  // Allocate memory for the 2D arrays.
  GENERIC_2D_ARRAY<double> U(size, size, 0.0);
  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
  GENERIC_2D_ARRAY<double> L(size, size, 0.0);

  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
  // U will be reduced to upper-triangular form; L accumulates the row swaps
  // and eliminations applied to I, so that at the end U = (ops) * input and
  // L = (ops) * I, giving input^-1 = U^-1 * L.
  int row;
  int col;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      U[row][col] = input[row*size + col];
      L[row][col] = row == col ? 1.0 : 0.0;
      U_inv[row][col] = 0.0;
    }
  }

  // Compute forward matrix by inversion by LU decomposition of input.
  for (col = 0; col < size; ++col) {
    // Find best pivot: the row at or below the diagonal with the largest
    // absolute value in this column (partial pivoting for stability).
    int best_row = 0;
    double best_pivot = -1.0;
    for (row = col; row < size; ++row) {
      if (Abs(U[row][col]) > best_pivot) {
        best_pivot = Abs(U[row][col]);
        best_row = row;
      }
    }
    // Exchange pivot rows (applied to both U and L to keep them consistent).
    if (best_row != col) {
      for (int k = 0; k < size; ++k) {
        double tmp = U[best_row][k];
        U[best_row][k] = U[col][k];
        U[col][k] = tmp;
        tmp = L[best_row][k];
        L[best_row][k] = L[col][k];
        L[col][k] = tmp;
      }
    }
    // Now do the pivot itself: eliminate this column from all lower rows.
    for (row = col + 1; row < size; ++row) {
      double ratio = -U[row][col] / U[col][col];
      for (int j = col; j < size; ++j) {
        U[row][j] += U[col][j] * ratio;
      }
      for (int k = 0; k < size; ++k) {
        L[row][k] += L[col][k] * ratio;
      }
    }
  }
  // Next invert U by back-substitution, one column at a time, working
  // upward from the diagonal.
  for (col = 0; col < size; ++col) {
    U_inv[col][col] = 1.0 / U[col][col];
    for (row = col - 1; row >= 0; --row) {
      double total = 0.0;
      for (int k = col; k > row; --k) {
        total += U[row][k] * U_inv[k][col];
      }
      U_inv[row][col] = -total / U[row][row];
    }
  }
  // Now the answer is U_inv.L.
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      // U_inv is upper-triangular, so terms below the diagonal are zero.
      for (int k = row; k < size; ++k) {
        sum += U_inv[row][k] * L[k][col];
      }
      inv[row*size + col] = sum;
    }
  }
  // Check matrix product: input * inv should be I, so any off-diagonal
  // magnitude is error.
  double error_sum = 0.0;
  for (row = 0; row < size; row++) {
    for (col = 0; col < size; col++) {
      double sum = 0.0;
      for (int k = 0; k < size; ++k) {
        sum += input[row*size + k] * inv[k *size + col];
      }
      if (row != col) {
        error_sum += Abs(sum);
      }
    }
  }
  return error_sum;
}