001 /*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements. See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License. You may obtain a copy of the License at
008 *
009 * http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017
018 package org.apache.commons.math.random;
019
020 import java.io.BufferedReader;
021 import java.io.File;
022 import java.io.FileReader;
023 import java.io.IOException;
024 import java.io.InputStreamReader;
025 import java.io.Serializable;
026 import java.net.URL;
027 import java.util.ArrayList;
028 import java.util.List;
029
030 import org.apache.commons.math.MathRuntimeException;
031 import org.apache.commons.math.stat.descriptive.StatisticalSummary;
032 import org.apache.commons.math.stat.descriptive.SummaryStatistics;
033
034 /**
035 * Implements <code>EmpiricalDistribution</code> interface. This implementation
036 * uses what amounts to the
037 * <a href="http://nedwww.ipac.caltech.edu/level5/March02/Silverman/Silver2_6.html">
038 * Variable Kernel Method</a> with Gaussian smoothing:<p>
039 * <strong>Digesting the input file</strong>
040 * <ol><li>Pass the file once to compute min and max.</li>
041 * <li>Divide the range from min-max into <code>binCount</code> "bins."</li>
042 * <li>Pass the data file again, computing bin counts and univariate
043 * statistics (mean, std dev.) for each of the bins </li>
044 * <li>Divide the interval (0,1) into subintervals associated with the bins,
045 * with the length of a bin's subinterval proportional to its count.</li></ol>
046 * <strong>Generating random values from the distribution</strong><ol>
047 * <li>Generate a uniformly distributed value in (0,1) </li>
048 * <li>Select the subinterval to which the value belongs.
049 * <li>Generate a random Gaussian value with mean = mean of the associated
050 * bin and std dev = std dev of associated bin.</li></ol></p><p>
051 *<strong>USAGE NOTES:</strong><ul>
052 *<li>The <code>binCount</code> is set by default to 1000. A good rule of thumb
053 * is to set the bin count to approximately the length of the input file divided
054 * by 10. </li>
055 *<li>The input file <i>must</i> be a plain text file containing one valid numeric
056 * entry per line.</li>
057 * </ul></p>
058 *
059 * @version $Revision: 772119 $ $Date: 2009-05-06 05:43:28 -0400 (Wed, 06 May 2009) $
060 */
061 public class EmpiricalDistributionImpl implements Serializable, EmpiricalDistribution {
062
063 /** Serializable version identifier */
064 private static final long serialVersionUID = 5729073523949762654L;
065
066 /** List of SummaryStatistics objects characterizing the bins */
067 private List<SummaryStatistics> binStats = null;
068
069 /** Sample statistics */
070 private SummaryStatistics sampleStats = null;
071
072 /** number of bins */
073 private int binCount = 1000;
074
075 /** is the distribution loaded? */
076 private boolean loaded = false;
077
078 /** upper bounds of subintervals in (0,1) "belonging" to the bins */
079 private double[] upperBounds = null;
080
081 /** RandomData instance to use in repeated calls to getNext() */
082 private RandomData randomData = new RandomDataImpl();
083
084 /**
085 * Creates a new EmpiricalDistribution with the default bin count.
086 */
087 public EmpiricalDistributionImpl() {
088 binStats = new ArrayList<SummaryStatistics>();
089 }
090
091 /**
092 * Creates a new EmpiricalDistribution with the specified bin count.
093 *
094 * @param binCount number of bins
095 */
096 public EmpiricalDistributionImpl(int binCount) {
097 this.binCount = binCount;
098 binStats = new ArrayList<SummaryStatistics>();
099 }
100
101 /**
102 * Computes the empirical distribution from the provided
103 * array of numbers.
104 *
105 * @param in the input data array
106 */
107 public void load(double[] in) {
108 DataAdapter da = new ArrayDataAdapter(in);
109 try {
110 da.computeStats();
111 fillBinStats(in);
112 } catch (Exception e) {
113 throw new MathRuntimeException(e);
114 }
115 loaded = true;
116
117 }
118
119 /**
120 * Computes the empirical distribution using data read from a URL.
121 * @param url url of the input file
122 *
123 * @throws IOException if an IO error occurs
124 */
125 public void load(URL url) throws IOException {
126 BufferedReader in =
127 new BufferedReader(new InputStreamReader(url.openStream()));
128 try {
129 DataAdapter da = new StreamDataAdapter(in);
130 try {
131 da.computeStats();
132 } catch (IOException ioe) {
133 // don't wrap exceptions which are already IOException
134 throw ioe;
135 } catch (RuntimeException rte) {
136 // don't wrap RuntimeExceptions
137 throw rte;
138 } catch (Exception e) {
139 throw MathRuntimeException.createIOException(e);
140 }
141 if (sampleStats.getN() == 0) {
142 throw MathRuntimeException.createEOFException("URL {0} contains no data",
143 url);
144 }
145 in = new BufferedReader(new InputStreamReader(url.openStream()));
146 fillBinStats(in);
147 loaded = true;
148 } finally {
149 try {
150 in.close();
151 } catch (IOException ex) {
152 // ignore
153 }
154 }
155 }
156
157 /**
158 * Computes the empirical distribution from the input file.
159 *
160 * @param file the input file
161 * @throws IOException if an IO error occurs
162 */
163 public void load(File file) throws IOException {
164 BufferedReader in = new BufferedReader(new FileReader(file));
165 try {
166 DataAdapter da = new StreamDataAdapter(in);
167 try {
168 da.computeStats();
169 } catch (IOException ioe) {
170 // don't wrap exceptions which are already IOException
171 throw ioe;
172 } catch (RuntimeException rte) {
173 // don't wrap RuntimeExceptions
174 throw rte;
175 } catch (Exception e) {
176 throw MathRuntimeException.createIOException(e);
177 }
178 in = new BufferedReader(new FileReader(file));
179 fillBinStats(in);
180 loaded = true;
181 } finally {
182 try {
183 in.close();
184 } catch (IOException ex) {
185 // ignore
186 }
187 }
188 }
189
190 /**
191 * Provides methods for computing <code>sampleStats</code> and
192 * <code>beanStats</code> abstracting the source of data.
193 */
194 private abstract class DataAdapter{
195 /**
196 * Compute bin stats.
197 *
198 * @param min minimum value
199 * @param delta grid size
200 * @throws Exception if an error occurs computing bin stats
201 */
202 public abstract void computeBinStats(double min, double delta)
203 throws Exception;
204 /**
205 * Compute sample statistics.
206 *
207 * @throws Exception if an error occurs computing sample stats
208 */
209 public abstract void computeStats() throws Exception;
210 }
211 /**
212 * Factory of <code>DataAdapter</code> objects. For every supported source
213 * of data (array of doubles, file, etc.) an instance of the proper object
214 * is returned.
215 */
216 private class DataAdapterFactory{
217 /**
218 * Creates a DataAdapter from a data object
219 *
220 * @param in object providing access to the data
221 * @return DataAdapter instance
222 */
223 public DataAdapter getAdapter(Object in) {
224 if (in instanceof BufferedReader) {
225 BufferedReader inputStream = (BufferedReader) in;
226 return new StreamDataAdapter(inputStream);
227 } else if (in instanceof double[]) {
228 double[] inputArray = (double[]) in;
229 return new ArrayDataAdapter(inputArray);
230 } else {
231 throw MathRuntimeException.createIllegalArgumentException(
232 "input data comes from unsupported datasource: {0}, " +
233 "supported sources: {1}, {2}",
234 in.getClass().getName(),
235 BufferedReader.class.getName(), double[].class.getName());
236 }
237 }
238 }
239 /**
240 * <code>DataAdapter</code> for data provided through some input stream
241 */
242 private class StreamDataAdapter extends DataAdapter{
243
244 /** Input stream providng access to the data */
245 private BufferedReader inputStream;
246
247 /**
248 * Create a StreamDataAdapter from a BufferedReader
249 *
250 * @param in BufferedReader input stream
251 */
252 public StreamDataAdapter(BufferedReader in){
253 super();
254 inputStream = in;
255 }
256 /**
257 * Computes binStats
258 *
259 * @param min minimum value
260 * @param delta grid size
261 * @throws IOException if an IO error occurs
262 */
263 @Override
264 public void computeBinStats(double min, double delta)
265 throws IOException {
266 String str = null;
267 double val = 0.0d;
268 while ((str = inputStream.readLine()) != null) {
269 val = Double.parseDouble(str);
270 SummaryStatistics stats = binStats.get(findBin(min, val, delta));
271 stats.addValue(val);
272 }
273
274 inputStream.close();
275 inputStream = null;
276 }
277 /**
278 * Computes sampleStats
279 *
280 * @throws IOException if an IOError occurs
281 */
282 @Override
283 public void computeStats() throws IOException {
284 String str = null;
285 double val = 0.0;
286 sampleStats = new SummaryStatistics();
287 while ((str = inputStream.readLine()) != null) {
288 val = Double.valueOf(str).doubleValue();
289 sampleStats.addValue(val);
290 }
291 inputStream.close();
292 inputStream = null;
293 }
294 }
295
296 /**
297 * <code>DataAdapter</code> for data provided as array of doubles.
298 */
299 private class ArrayDataAdapter extends DataAdapter{
300
301 /** Array of input data values */
302 private double[] inputArray;
303
304 /**
305 * Construct an ArrayDataAdapter from a double[] array
306 *
307 * @param in double[] array holding the data
308 */
309 public ArrayDataAdapter(double[] in){
310 super();
311 inputArray = in;
312 }
313 /**
314 * Computes sampleStats
315 *
316 * @throws IOException if an IO error occurs
317 */
318 @Override
319 public void computeStats() throws IOException {
320 sampleStats = new SummaryStatistics();
321 for (int i = 0; i < inputArray.length; i++) {
322 sampleStats.addValue(inputArray[i]);
323 }
324 }
325 /**
326 * Computes binStats
327 *
328 * @param min minimum value
329 * @param delta grid size
330 * @throws IOException if an IO error occurs
331 */
332 @Override
333 public void computeBinStats(double min, double delta)
334 throws IOException {
335 for (int i = 0; i < inputArray.length; i++) {
336 SummaryStatistics stats =
337 binStats.get(findBin(min, inputArray[i], delta));
338 stats.addValue(inputArray[i]);
339 }
340 }
341 }
342
343 /**
344 * Fills binStats array (second pass through data file).
345 *
346 * @param in object providing access to the data
347 * @throws IOException if an IO error occurs
348 */
349 private void fillBinStats(Object in) throws IOException {
350 // Load array of bin upper bounds -- evenly spaced from min - max
351 double min = sampleStats.getMin();
352 double max = sampleStats.getMax();
353 double delta = (max - min)/(Double.valueOf(binCount)).doubleValue();
354 double[] binUpperBounds = new double[binCount];
355 binUpperBounds[0] = min + delta;
356 for (int i = 1; i< binCount - 1; i++) {
357 binUpperBounds[i] = binUpperBounds[i-1] + delta;
358 }
359 binUpperBounds[binCount -1] = max;
360
361 // Initialize binStats ArrayList
362 if (!binStats.isEmpty()) {
363 binStats.clear();
364 }
365 for (int i = 0; i < binCount; i++) {
366 SummaryStatistics stats = new SummaryStatistics();
367 binStats.add(i,stats);
368 }
369
370 // Filling data in binStats Array
371 DataAdapterFactory aFactory = new DataAdapterFactory();
372 DataAdapter da = aFactory.getAdapter(in);
373 try {
374 da.computeBinStats(min, delta);
375 } catch (IOException ioe) {
376 // don't wrap exceptions which are already IOException
377 throw ioe;
378 } catch (RuntimeException rte) {
379 // don't wrap RuntimeExceptions
380 throw rte;
381 } catch (Exception e) {
382 throw MathRuntimeException.createIOException(e);
383 }
384
385 // Assign upperBounds based on bin counts
386 upperBounds = new double[binCount];
387 upperBounds[0] =
388 ((double) binStats.get(0).getN()) / (double) sampleStats.getN();
389 for (int i = 1; i < binCount-1; i++) {
390 upperBounds[i] = upperBounds[i-1] +
391 ((double) binStats.get(i).getN()) / (double) sampleStats.getN();
392 }
393 upperBounds[binCount-1] = 1.0d;
394 }
395
396 /**
397 * Returns the index of the bin to which the given value belongs
398 *
399 * @param min the minimum value
400 * @param value the value whose bin we are trying to find
401 * @param delta the grid size
402 * @return the index of the bin containing the value
403 */
404 private int findBin(double min, double value, double delta) {
405 return Math.min(
406 Math.max((int) Math.ceil((value- min) / delta) - 1, 0),
407 binCount - 1);
408 }
409
410 /**
411 * Generates a random value from this distribution.
412 *
413 * @return the random value.
414 * @throws IllegalStateException if the distribution has not been loaded
415 */
416 public double getNextValue() throws IllegalStateException {
417
418 if (!loaded) {
419 throw MathRuntimeException.createIllegalStateException("distribution not loaded");
420 }
421
422 // Start with a uniformly distributed random number in (0,1)
423 double x = Math.random();
424
425 // Use this to select the bin and generate a Gaussian within the bin
426 for (int i = 0; i < binCount; i++) {
427 if (x <= upperBounds[i]) {
428 SummaryStatistics stats = binStats.get(i);
429 if (stats.getN() > 0) {
430 if (stats.getStandardDeviation() > 0) { // more than one obs
431 return randomData.nextGaussian
432 (stats.getMean(),stats.getStandardDeviation());
433 } else {
434 return stats.getMean(); // only one obs in bin
435 }
436 }
437 }
438 }
439 throw new MathRuntimeException("no bin selected");
440 }
441
442 /**
443 * Returns a {@link StatisticalSummary} describing this distribution.
444 * <strong>Preconditions:</strong><ul>
445 * <li>the distribution must be loaded before invoking this method</li></ul>
446 *
447 * @return the sample statistics
448 * @throws IllegalStateException if the distribution has not been loaded
449 */
450 public StatisticalSummary getSampleStats() {
451 return sampleStats;
452 }
453
454 /**
455 * Returns the number of bins.
456 *
457 * @return the number of bins.
458 */
459 public int getBinCount() {
460 return binCount;
461 }
462
463 /**
464 * Returns a List of {@link SummaryStatistics} instances containing
465 * statistics describing the values in each of the bins. The list is
466 * indexed on the bin number.
467 *
468 * @return List of bin statistics.
469 */
470 public List<SummaryStatistics> getBinStats() {
471 return binStats;
472 }
473
474 /**
475 * Returns (a fresh copy of) the array of upper bounds for the bins.
476 Bins are: <br/>
477 * [min,upperBounds[0]],(upperBounds[0],upperBounds[1]],...,
478 * (upperBounds[binCount-1],max]
479 *
480 * @return array of bin upper bounds
481 */
482 public double[] getUpperBounds() {
483 int len = upperBounds.length;
484 double[] out = new double[len];
485 System.arraycopy(upperBounds, 0, out, 0, len);
486 return out;
487 }
488
489 /**
490 * Property indicating whether or not the distribution has been loaded.
491 *
492 * @return true if the distribution has been loaded
493 */
494 public boolean isLoaded() {
495 return loaded;
496 }
497 }