001    /*
002     * Licensed to the Apache Software Foundation (ASF) under one or more
003     * contributor license agreements.  See the NOTICE file distributed with
004     * this work for additional information regarding copyright ownership.
005     * The ASF licenses this file to You under the Apache License, Version 2.0
006     * (the "License"); you may not use this file except in compliance with
007     * the License.  You may obtain a copy of the License at
008     *
009     *      http://www.apache.org/licenses/LICENSE-2.0
010     *
011     * Unless required by applicable law or agreed to in writing, software
012     * distributed under the License is distributed on an "AS IS" BASIS,
013     * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014     * See the License for the specific language governing permissions and
015     * limitations under the License.
016     */
017    package org.apache.commons.math.stat.descriptive;
018    
019    import java.io.Serializable;
020    import java.util.Arrays;
021    
022    import org.apache.commons.math.DimensionMismatchException;
023    import org.apache.commons.math.MathRuntimeException;
024    import org.apache.commons.math.linear.RealMatrix;
025    import org.apache.commons.math.stat.descriptive.moment.GeometricMean;
026    import org.apache.commons.math.stat.descriptive.moment.Mean;
027    import org.apache.commons.math.stat.descriptive.moment.VectorialCovariance;
028    import org.apache.commons.math.stat.descriptive.rank.Max;
029    import org.apache.commons.math.stat.descriptive.rank.Min;
030    import org.apache.commons.math.stat.descriptive.summary.Sum;
031    import org.apache.commons.math.stat.descriptive.summary.SumOfLogs;
032    import org.apache.commons.math.stat.descriptive.summary.SumOfSquares;
033    import org.apache.commons.math.util.MathUtils;
034    
035    /**
036     * <p>Computes summary statistics for a stream of n-tuples added using the
037     * {@link #addValue(double[]) addValue} method. The data values are not stored
038     * in memory, so this class can be used to compute statistics for very large
039     * n-tuple streams.</p>
040     *
041     * <p>The {@link StorelessUnivariateStatistic} instances used to maintain
042     * summary state and compute statistics are configurable via setters.
043     * For example, the default implementation for the mean can be overridden by
044     * calling {@link #setMeanImpl(StorelessUnivariateStatistic[])}. Actual
045     * parameters to these methods must implement the
046     * {@link StorelessUnivariateStatistic} interface and configuration must be
047     * completed before <code>addValue</code> is called. No configuration is
048     * necessary to use the default, commons-math provided implementations.</p>
049     *
050     * <p>To compute statistics for a stream of n-tuples, construct a
051     * MultivariateStatistics instance with dimension n and then use
052     * {@link #addValue(double[])} to add n-tuples. The <code>getXxx</code>
053     * methods where Xxx is a statistic return an array of <code>double</code>
054     * values, where for <code>i = 0,...,n-1</code> the i<sup>th</sup> array element is the
055     * value of the given statistic for data range consisting of the i<sup>th</sup> element of
056     * each of the input n-tuples.  For example, if <code>addValue</code> is called
057     * with actual parameters {0, 1, 2}, then {3, 4, 5} and finally {6, 7, 8},
058     * <code>getSum</code> will return a three-element array with values
059     * {0+3+6, 1+4+7, 2+5+8}</p>
060     *
061     * <p>Note: This class is not thread-safe. Use
062     * {@link SynchronizedMultivariateSummaryStatistics} if concurrent access from multiple
063     * threads is required.</p>
064     *
065     * @since 1.2
066     * @version $Revision: 811833 $ $Date: 2009-09-06 12:27:50 -0400 (Sun, 06 Sep 2009) $
067     */
068    public class MultivariateSummaryStatistics
069      implements StatisticalMultivariateSummary, Serializable {
070    
071        /** Serialization UID */
072        private static final long serialVersionUID = 2271900808994826718L;
073    
074        /** Dimension of the data. */
075        private int k;
076    
077        /** Count of values that have been added */
078        private long n = 0;
079    
080        /** Sum statistic implementation - can be reset by setter. */
081        private StorelessUnivariateStatistic[] sumImpl;
082    
083        /** Sum of squares statistic implementation - can be reset by setter. */
084        private StorelessUnivariateStatistic[] sumSqImpl;
085    
086        /** Minimum statistic implementation - can be reset by setter. */
087        private StorelessUnivariateStatistic[] minImpl;
088    
089        /** Maximum statistic implementation - can be reset by setter. */
090        private StorelessUnivariateStatistic[] maxImpl;
091    
092        /** Sum of log statistic implementation - can be reset by setter. */
093        private StorelessUnivariateStatistic[] sumLogImpl;
094    
095        /** Geometric mean statistic implementation - can be reset by setter. */
096        private StorelessUnivariateStatistic[] geoMeanImpl;
097    
098        /** Mean statistic implementation - can be reset by setter. */
099        private StorelessUnivariateStatistic[] meanImpl;
100    
101        /** Covariance statistic implementation - cannot be reset. */
102        private VectorialCovariance covarianceImpl;
103    
104        /**
105         * Construct a MultivariateSummaryStatistics instance
106         * @param k dimension of the data
107         * @param isCovarianceBiasCorrected if true, the unbiased sample
108         * covariance is computed, otherwise the biased population covariance
109         * is computed
110         */
111        public MultivariateSummaryStatistics(int k, boolean isCovarianceBiasCorrected) {
112            this.k = k;
113    
114            sumImpl     = new StorelessUnivariateStatistic[k];
115            sumSqImpl   = new StorelessUnivariateStatistic[k];
116            minImpl     = new StorelessUnivariateStatistic[k];
117            maxImpl     = new StorelessUnivariateStatistic[k];
118            sumLogImpl  = new StorelessUnivariateStatistic[k];
119            geoMeanImpl = new StorelessUnivariateStatistic[k];
120            meanImpl    = new StorelessUnivariateStatistic[k];
121    
122            for (int i = 0; i < k; ++i) {
123                sumImpl[i]     = new Sum();
124                sumSqImpl[i]   = new SumOfSquares();
125                minImpl[i]     = new Min();
126                maxImpl[i]     = new Max();
127                sumLogImpl[i]  = new SumOfLogs();
128                geoMeanImpl[i] = new GeometricMean();
129                meanImpl[i]    = new Mean();
130            }
131    
132            covarianceImpl =
133                new VectorialCovariance(k, isCovarianceBiasCorrected);
134    
135        }
136    
137        /**
138         * Add an n-tuple to the data
139         *
140         * @param value  the n-tuple to add
141         * @throws DimensionMismatchException if the length of the array
142         * does not match the one used at construction
143         */
144        public void addValue(double[] value)
145          throws DimensionMismatchException {
146            checkDimension(value.length);
147            for (int i = 0; i < k; ++i) {
148                double v = value[i];
149                sumImpl[i].increment(v);
150                sumSqImpl[i].increment(v);
151                minImpl[i].increment(v);
152                maxImpl[i].increment(v);
153                sumLogImpl[i].increment(v);
154                geoMeanImpl[i].increment(v);
155                meanImpl[i].increment(v);
156            }
157            covarianceImpl.increment(value);
158            n++;
159        }
160    
161        /**
162         * Returns the dimension of the data
163         * @return The dimension of the data
164         */
165        public int getDimension() {
166            return k;
167        }
168    
169        /**
170         * Returns the number of available values
171         * @return The number of available values
172         */
173        public long getN() {
174            return n;
175        }
176    
177        /**
178         * Returns an array of the results of a statistic.
179         * @param stats univariate statistic array
180         * @return results array
181         */
182        private double[] getResults(StorelessUnivariateStatistic[] stats) {
183            double[] results = new double[stats.length];
184            for (int i = 0; i < results.length; ++i) {
185                results[i] = stats[i].getResult();
186            }
187            return results;
188        }
189    
190        /**
191         * Returns an array whose i<sup>th</sup> entry is the sum of the
192         * i<sup>th</sup> entries of the arrays that have been added using
193         * {@link #addValue(double[])}
194         *
195         * @return the array of component sums
196         */
197        public double[] getSum() {
198            return getResults(sumImpl);
199        }
200    
201        /**
202         * Returns an array whose i<sup>th</sup> entry is the sum of squares of the
203         * i<sup>th</sup> entries of the arrays that have been added using
204         * {@link #addValue(double[])}
205         *
206         * @return the array of component sums of squares
207         */
208        public double[] getSumSq() {
209            return getResults(sumSqImpl);
210        }
211    
212        /**
213         * Returns an array whose i<sup>th</sup> entry is the sum of logs of the
214         * i<sup>th</sup> entries of the arrays that have been added using
215         * {@link #addValue(double[])}
216         *
217         * @return the array of component log sums
218         */
219        public double[] getSumLog() {
220            return getResults(sumLogImpl);
221        }
222    
223        /**
224         * Returns an array whose i<sup>th</sup> entry is the mean of the
225         * i<sup>th</sup> entries of the arrays that have been added using
226         * {@link #addValue(double[])}
227         *
228         * @return the array of component means
229         */
230        public double[] getMean() {
231            return getResults(meanImpl);
232        }
233    
234        /**
235         * Returns an array whose i<sup>th</sup> entry is the standard deviation of the
236         * i<sup>th</sup> entries of the arrays that have been added using
237         * {@link #addValue(double[])}
238         *
239         * @return the array of component standard deviations
240         */
241        public double[] getStandardDeviation() {
242            double[] stdDev = new double[k];
243            if (getN() < 1) {
244                Arrays.fill(stdDev, Double.NaN);
245            } else if (getN() < 2) {
246                Arrays.fill(stdDev, 0.0);
247            } else {
248                RealMatrix matrix = covarianceImpl.getResult();
249                for (int i = 0; i < k; ++i) {
250                    stdDev[i] = Math.sqrt(matrix.getEntry(i, i));
251                }
252            }
253            return stdDev;
254        }
255    
256        /**
257         * Returns the covariance matrix of the values that have been added.
258         *
259         * @return the covariance matrix
260         */
261        public RealMatrix getCovariance() {
262            return covarianceImpl.getResult();
263        }
264    
265        /**
266         * Returns an array whose i<sup>th</sup> entry is the maximum of the
267         * i<sup>th</sup> entries of the arrays that have been added using
268         * {@link #addValue(double[])}
269         *
270         * @return the array of component maxima
271         */
272        public double[] getMax() {
273            return getResults(maxImpl);
274        }
275    
276        /**
277         * Returns an array whose i<sup>th</sup> entry is the minimum of the
278         * i<sup>th</sup> entries of the arrays that have been added using
279         * {@link #addValue(double[])}
280         *
281         * @return the array of component minima
282         */
283        public double[] getMin() {
284            return getResults(minImpl);
285        }
286    
287        /**
288         * Returns an array whose i<sup>th</sup> entry is the geometric mean of the
289         * i<sup>th</sup> entries of the arrays that have been added using
290         * {@link #addValue(double[])}
291         *
292         * @return the array of component geometric means
293         */
294        public double[] getGeometricMean() {
295            return getResults(geoMeanImpl);
296        }
297    
298        /**
299         * Generates a text report displaying
300         * summary statistics from values that
301         * have been added.
302         * @return String with line feeds displaying statistics
303         */
304        @Override
305        public String toString() {
306            StringBuffer outBuffer = new StringBuffer();
307            outBuffer.append("MultivariateSummaryStatistics:\n");
308            outBuffer.append("n: " + getN() + "\n");
309            append(outBuffer, getMin(), "min: ", ", ", "\n");
310            append(outBuffer, getMax(), "max: ", ", ", "\n");
311            append(outBuffer, getMean(), "mean: ", ", ", "\n");
312            append(outBuffer, getGeometricMean(), "geometric mean: ", ", ", "\n");
313            append(outBuffer, getSumSq(), "sum of squares: ", ", ", "\n");
314            append(outBuffer, getSumLog(), "sum of logarithms: ", ", ", "\n");
315            append(outBuffer, getStandardDeviation(), "standard deviation: ", ", ", "\n");
316            outBuffer.append("covariance: " + getCovariance().toString() + "\n");
317            return outBuffer.toString();
318        }
319    
320        /**
321         * Append a text representation of an array to a buffer.
322         * @param buffer buffer to fill
323         * @param data data array
324         * @param prefix text prefix
325         * @param separator elements separator
326         * @param suffix text suffix
327         */
328        private void append(StringBuffer buffer, double[] data,
329                            String prefix, String separator, String suffix) {
330            buffer.append(prefix);
331            for (int i = 0; i < data.length; ++i) {
332                if (i > 0) {
333                    buffer.append(separator);
334                }
335                buffer.append(data[i]);
336            }
337            buffer.append(suffix);
338        }
339    
340        /**
341         * Resets all statistics and storage
342         */
343        public void clear() {
344            this.n = 0;
345            for (int i = 0; i < k; ++i) {
346                minImpl[i].clear();
347                maxImpl[i].clear();
348                sumImpl[i].clear();
349                sumLogImpl[i].clear();
350                sumSqImpl[i].clear();
351                geoMeanImpl[i].clear();
352                meanImpl[i].clear();
353            }
354            covarianceImpl.clear();
355        }
356    
357        /**
358         * Returns true iff <code>object</code> is a <code>SummaryStatistics</code>
359         * instance and all statistics have the same values as this.
360         * @param object the object to test equality against.
361         * @return true if object equals this
362         */
363        @Override
364        public boolean equals(Object object) {
365            if (object == this ) {
366                return true;
367            }
368            if (object instanceof MultivariateSummaryStatistics == false) {
369                return false;
370            }
371            MultivariateSummaryStatistics stat = (MultivariateSummaryStatistics) object;
372            return MathUtils.equals(stat.getGeometricMean(), getGeometricMean()) &&
373                   MathUtils.equals(stat.getMax(),           getMax())           &&
374                   MathUtils.equals(stat.getMean(),          getMean())          &&
375                   MathUtils.equals(stat.getMin(),           getMin())           &&
376                   MathUtils.equals(stat.getN(),             getN())             &&
377                   MathUtils.equals(stat.getSum(),           getSum())           &&
378                   MathUtils.equals(stat.getSumSq(),         getSumSq())         &&
379                   MathUtils.equals(stat.getSumLog(),        getSumLog())        &&
380                   stat.getCovariance().equals( getCovariance());
381        }
382    
383        /**
384         * Returns hash code based on values of statistics
385         *
386         * @return hash code
387         */
388        @Override
389        public int hashCode() {
390            int result = 31 + MathUtils.hash(getGeometricMean());
391            result = result * 31 + MathUtils.hash(getGeometricMean());
392            result = result * 31 + MathUtils.hash(getMax());
393            result = result * 31 + MathUtils.hash(getMean());
394            result = result * 31 + MathUtils.hash(getMin());
395            result = result * 31 + MathUtils.hash(getN());
396            result = result * 31 + MathUtils.hash(getSum());
397            result = result * 31 + MathUtils.hash(getSumSq());
398            result = result * 31 + MathUtils.hash(getSumLog());
399            result = result * 31 + getCovariance().hashCode();
400            return result;
401        }
402    
403        // Getters and setters for statistics implementations
404        /**
405         * Sets statistics implementations.
406         * @param newImpl new implementations for statistics
407         * @param oldImpl old implementations for statistics
408         * @throws DimensionMismatchException if the array dimension
409         * does not match the one used at construction
410         * @throws IllegalStateException if data has already been added
411         *  (i.e if n > 0)
412         */
413        private void setImpl(StorelessUnivariateStatistic[] newImpl,
414                             StorelessUnivariateStatistic[] oldImpl)
415           throws DimensionMismatchException, IllegalStateException {
416            checkEmpty();
417            checkDimension(newImpl.length);
418            System.arraycopy(newImpl, 0, oldImpl, 0, newImpl.length);
419        }
420    
421        /**
422         * Returns the currently configured Sum implementation
423         *
424         * @return the StorelessUnivariateStatistic implementing the sum
425         */
426        public StorelessUnivariateStatistic[] getSumImpl() {
427            return sumImpl.clone();
428        }
429    
430        /**
431         * <p>Sets the implementation for the Sum.</p>
432         * <p>This method must be activated before any data has been added - i.e.,
433         * before {@link #addValue(double[]) addValue} has been used to add data;
434         * otherwise an IllegalStateException will be thrown.</p>
435         *
436         * @param sumImpl the StorelessUnivariateStatistic instance to use
437         * for computing the Sum
438         * @throws DimensionMismatchException if the array dimension
439         * does not match the one used at construction
440         * @throws IllegalStateException if data has already been added
441         *  (i.e if n > 0)
442         */
443        public void setSumImpl(StorelessUnivariateStatistic[] sumImpl)
444          throws DimensionMismatchException {
445            setImpl(sumImpl, this.sumImpl);
446        }
447    
448        /**
449         * Returns the currently configured sum of squares implementation
450         *
451         * @return the StorelessUnivariateStatistic implementing the sum of squares
452         */
453        public StorelessUnivariateStatistic[] getSumsqImpl() {
454            return sumSqImpl.clone();
455        }
456    
457        /**
458         * <p>Sets the implementation for the sum of squares.</p>
459         * <p>This method must be activated before any data has been added - i.e.,
460         * before {@link #addValue(double[]) addValue} has been used to add data;
461         * otherwise an IllegalStateException will be thrown.</p>
462         *
463         * @param sumsqImpl the StorelessUnivariateStatistic instance to use
464         * for computing the sum of squares
465         * @throws DimensionMismatchException if the array dimension
466         * does not match the one used at construction
467         * @throws IllegalStateException if data has already been added
468         *  (i.e if n > 0)
469         */
470        public void setSumsqImpl(StorelessUnivariateStatistic[] sumsqImpl)
471          throws DimensionMismatchException {
472            setImpl(sumsqImpl, this.sumSqImpl);
473        }
474    
475        /**
476         * Returns the currently configured minimum implementation
477         *
478         * @return the StorelessUnivariateStatistic implementing the minimum
479         */
480        public StorelessUnivariateStatistic[] getMinImpl() {
481            return minImpl.clone();
482        }
483    
484        /**
485         * <p>Sets the implementation for the minimum.</p>
486         * <p>This method must be activated before any data has been added - i.e.,
487         * before {@link #addValue(double[]) addValue} has been used to add data;
488         * otherwise an IllegalStateException will be thrown.</p>
489         *
490         * @param minImpl the StorelessUnivariateStatistic instance to use
491         * for computing the minimum
492         * @throws DimensionMismatchException if the array dimension
493         * does not match the one used at construction
494         * @throws IllegalStateException if data has already been added
495         *  (i.e if n > 0)
496         */
497        public void setMinImpl(StorelessUnivariateStatistic[] minImpl)
498          throws DimensionMismatchException {
499            setImpl(minImpl, this.minImpl);
500        }
501    
502        /**
503         * Returns the currently configured maximum implementation
504         *
505         * @return the StorelessUnivariateStatistic implementing the maximum
506         */
507        public StorelessUnivariateStatistic[] getMaxImpl() {
508            return maxImpl.clone();
509        }
510    
511        /**
512         * <p>Sets the implementation for the maximum.</p>
513         * <p>This method must be activated before any data has been added - i.e.,
514         * before {@link #addValue(double[]) addValue} has been used to add data;
515         * otherwise an IllegalStateException will be thrown.</p>
516         *
517         * @param maxImpl the StorelessUnivariateStatistic instance to use
518         * for computing the maximum
519         * @throws DimensionMismatchException if the array dimension
520         * does not match the one used at construction
521         * @throws IllegalStateException if data has already been added
522         *  (i.e if n > 0)
523         */
524        public void setMaxImpl(StorelessUnivariateStatistic[] maxImpl)
525          throws DimensionMismatchException {
526            setImpl(maxImpl, this.maxImpl);
527        }
528    
529        /**
530         * Returns the currently configured sum of logs implementation
531         *
532         * @return the StorelessUnivariateStatistic implementing the log sum
533         */
534        public StorelessUnivariateStatistic[] getSumLogImpl() {
535            return sumLogImpl.clone();
536        }
537    
538        /**
539         * <p>Sets the implementation for the sum of logs.</p>
540         * <p>This method must be activated before any data has been added - i.e.,
541         * before {@link #addValue(double[]) addValue} has been used to add data;
542         * otherwise an IllegalStateException will be thrown.</p>
543         *
544         * @param sumLogImpl the StorelessUnivariateStatistic instance to use
545         * for computing the log sum
546         * @throws DimensionMismatchException if the array dimension
547         * does not match the one used at construction
548         * @throws IllegalStateException if data has already been added
549         *  (i.e if n > 0)
550         */
551        public void setSumLogImpl(StorelessUnivariateStatistic[] sumLogImpl)
552          throws DimensionMismatchException {
553            setImpl(sumLogImpl, this.sumLogImpl);
554        }
555    
556        /**
557         * Returns the currently configured geometric mean implementation
558         *
559         * @return the StorelessUnivariateStatistic implementing the geometric mean
560         */
561        public StorelessUnivariateStatistic[] getGeoMeanImpl() {
562            return geoMeanImpl.clone();
563        }
564    
565        /**
566         * <p>Sets the implementation for the geometric mean.</p>
567         * <p>This method must be activated before any data has been added - i.e.,
568         * before {@link #addValue(double[]) addValue} has been used to add data;
569         * otherwise an IllegalStateException will be thrown.</p>
570         *
571         * @param geoMeanImpl the StorelessUnivariateStatistic instance to use
572         * for computing the geometric mean
573         * @throws DimensionMismatchException if the array dimension
574         * does not match the one used at construction
575         * @throws IllegalStateException if data has already been added
576         *  (i.e if n > 0)
577         */
578        public void setGeoMeanImpl(StorelessUnivariateStatistic[] geoMeanImpl)
579          throws DimensionMismatchException {
580            setImpl(geoMeanImpl, this.geoMeanImpl);
581        }
582    
583        /**
584         * Returns the currently configured mean implementation
585         *
586         * @return the StorelessUnivariateStatistic implementing the mean
587         */
588        public StorelessUnivariateStatistic[] getMeanImpl() {
589            return meanImpl.clone();
590        }
591    
592        /**
593         * <p>Sets the implementation for the mean.</p>
594         * <p>This method must be activated before any data has been added - i.e.,
595         * before {@link #addValue(double[]) addValue} has been used to add data;
596         * otherwise an IllegalStateException will be thrown.</p>
597         *
598         * @param meanImpl the StorelessUnivariateStatistic instance to use
599         * for computing the mean
600         * @throws DimensionMismatchException if the array dimension
601         * does not match the one used at construction
602         * @throws IllegalStateException if data has already been added
603         *  (i.e if n > 0)
604         */
605        public void setMeanImpl(StorelessUnivariateStatistic[] meanImpl)
606          throws DimensionMismatchException {
607            setImpl(meanImpl, this.meanImpl);
608        }
609    
610        /**
611         * Throws IllegalStateException if n > 0.
612         */
613        private void checkEmpty() {
614            if (n > 0) {
615                throw MathRuntimeException.createIllegalStateException(
616                        "{0} values have been added before statistic is configured",
617                        n);
618            }
619        }
620    
621        /**
622         * Throws DimensionMismatchException if dimension != k.
623         * @param dimension dimension to check
624         * @throws DimensionMismatchException if dimension != k
625         */
626        private void checkDimension(int dimension)
627          throws DimensionMismatchException {
628            if (dimension != k) {
629                throw new DimensionMismatchException(dimension, k);
630            }
631        }
632    
633    }