OpenWalnut  1.4.0
WValueSetHistogram.h
1 //---------------------------------------------------------------------------
2 //
3 // Project: OpenWalnut ( http://www.openwalnut.org )
4 //
5 // Copyright 2009 OpenWalnut Community, BSV@Uni-Leipzig and CNCF@MPI-CBS
6 // For more information see http://www.openwalnut.org/copying
7 //
8 // This file is part of OpenWalnut.
9 //
10 // OpenWalnut is free software: you can redistribute it and/or modify
11 // it under the terms of the GNU Lesser General Public License as published by
12 // the Free Software Foundation, either version 3 of the License, or
13 // (at your option) any later version.
14 //
15 // OpenWalnut is distributed in the hope that it will be useful,
16 // but WITHOUT ANY WARRANTY; without even the implied warranty of
17 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 // GNU Lesser General Public License for more details.
19 //
20 // You should have received a copy of the GNU Lesser General Public License
21 // along with OpenWalnut. If not, see <http://www.gnu.org/licenses/>.
22 //
23 //---------------------------------------------------------------------------
24 
25 #ifndef WVALUESETHISTOGRAM_H
26 #define WVALUESETHISTOGRAM_H
27 
28 #include <map>
29 #include <list>
30 #include <utility>
31 
32 #ifndef Q_MOC_RUN
33 #include <boost/shared_ptr.hpp>
34 #endif
35 #ifndef Q_MOC_RUN
36 #include <boost/scoped_array.hpp>
37 #endif
38 #ifndef Q_MOC_RUN
39 #include <boost/shared_array.hpp>
40 #endif
41 
42 #include "../../common/WHistogram.h"
43 #include "../WValueSet.h"
44 
45 
46 /**
47  * Used to find the occurrence frequencies of values in a value set. It implements a classical histogram but allows easy modification of bucket
48  * sizes without unnecessary recalculation of the whole histogram. This histogram uses right-open intervals for counting, which is why there
49  * always is a bucket at the end from max to infinity which holds all the max values.
50  *
51  * \note This histogram is different from from WValueSetHistogram which is a generic histogram class.
52  */
53 class WValueSetHistogram: public WHistogram // NOLINT
54 {
55 friend class WValueSetHistogramTest;
56 public:
57  /**
58  * Constructor. Creates the histogram for the specified value set.
59  *
60  * \param valueSet source of the data for the histogram
61  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
62  */
63  explicit WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, size_t buckets = 1000 );
64 
65  /**
66  * Constructor. Creates the histogram for the specified value set.
67  *
68  * \param valueSet source of the data for the histogram
69  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
70  */
71  explicit WValueSetHistogram( const WValueSetBase& valueSet, size_t buckets = 1000 );
72 
73  /**
74  * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
75  * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
76  * is especially useful to filter out outliers in data.
77  *
78  * \param valueSet source data
79  * \param min the new minimum to use
80  * \param max the maximum to use
81  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
82  */
83  WValueSetHistogram( boost::shared_ptr< WValueSetBase > valueSet, double min, double max, size_t buckets = 1000 );
84 
85  /**
86  * Constructor. Creates a histogram from the specified value set but allows cropping of values below the given min and above the given max.
87  * It actually interprets all values below min and above max to be exactly min and exactly max and sorts them into the appropriate bin. This
88  * is especially useful to filter out outliers in data.
89  *
90  * \param valueSet source data
91  * \param min the new minimum to use
92  * \param max the maximum to use
93  * \param buckets the number of buckets to use. If not specified, 1000 is used as default. Must be larger than 1.
94  */
95  WValueSetHistogram( const WValueSetBase& valueSet, double min, double max, size_t buckets = 1000 );
96 
97  /**
98  * Copy constructor. If another interval size is given the histogram gets matched to it using the initial bucket data.
99  * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
100  *
101  * \param histogram another WValueSetHistogram
102  * \param buckets the new number of buckets. Must be larger than 1 if specified.
103  */
104  WValueSetHistogram( const WValueSetHistogram& histogram, size_t buckets = 0 );
105 
106  /**
107  * Destructor.
108  */
110 
111  /**
112  * Copy assignment. Copies the contents of the specified histogram to this instance.
113  *
114  * \param other the other instance
115  *
116  * \return this instance with the contents of the other one.
117  * \note this does not deep copy the m_initialBuckets and m_mappedBuckets array as these are shared_array instances.
118  */
120 
121  /**
122  * Get the count of the bucket.
123  *
124  * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
125  *
126  * \return elements in the bucket.
127  */
128  virtual size_t operator[]( size_t index ) const;
129 
130  /**
131  * Get the count of the bucket. Testing if the position is valid.
132  *
133  * \param index which buckets count is to be returned; starts with 0 which is the bucket containing the smallest values.
134  *
135  * \return elements in the bucket
136  */
137  virtual size_t at( size_t index ) const;
138 
139  /**
140  * Returns the number of buckets in the histogram with the actual mapping.
141  *
142  * \return number of buckets
143  */
144  virtual size_t size() const;
145 
146  /**
147  * Return the size of one bucket.
148  *
149  * \param index the width for this bucket is queried.
150  *
151  * \return the size of a bucket.
152  */
153  virtual double getBucketSize( size_t index = 0 ) const;
154 
155  /**
156  * Returns the actual interval associated with the given index. The interval is open, meaning that
157  * getIntervalForIndex( i ).second == getIntervalForIndex( i + 1 ).first but does not belong anymore to the interval itself but every value
158  * smaller than getIntervalForIndex( i ).second.
159  *
160  * \param index the intex
161  *
162  * \return the open interval.
163  */
164  virtual std::pair< double, double > getIntervalForIndex( size_t index ) const;
165 
166  /**
167  * Returns the right index to the bucket containing the given value. If a value larger than the maximum, the maximum index is returned. Same
168  * for minimum; if the value is smaller than the minimum, 0 is returned.
169  *
170  * \param value the value to search the index for
171  *
172  * \return the index of the bucket
173  */
174  virtual size_t getIndexForValue( double value ) const;
175 
176  /**
177  * This returns the number of value set entries added to the histogram. This is especially useful to normalize the histogram counts.
178  *
179  * \return the number of elements distributed in the buckets.
180  */
181  virtual size_t getTotalElementCount() const;
182 
183  /**
184  * Sums up the buckets in the specified interval. Especially useful for cumulative distribution functions or similar.
185  *
186  * \param startIndex the index where to start counting including this one
187  * \param endIndex the index where to end summing up excluding this one.
188  *
189  * \return the sum of all buckets in the interval.
190  * \throw WOutOfBounds if one of the indices is invalid.
191  */
192  virtual size_t accumulate( size_t startIndex, size_t endIndex ) const;
193 
194 protected:
195  /**
196  * Return the initial buckets.
197  *
198  * \return m_initialBuckets
199  */
200  boost::shared_array< size_t > getInitialBuckets() const;
201 
202  /**
203  * Return the number of initial buckets.
204  *
205  * \return m_nInitialBuckets
206  */
207  size_t getNInitialBuckets() const;
208 
209  /**
210  * Return the size of one initial bucket.
211  *
212  * \return m_bucketSize
213  */
214  double getInitialBucketSize() const;
215 
216  /**
217  * increment the value by one, contains the logic to find the element place in the array.
218  * Should only be used in the constructor i.e. while iterating over WValueSet.
219  *
220  * \param value value to increment
221  */
222  virtual void insert( double value );
223 
224 private:
225  /**
226  * Size of one bucket in the initial histogram.
227  */
229 
230  /**
231  * Pointer to all initial buckets of the histogram.
232  */
233  boost::shared_array< size_t > m_initialBuckets;
234 
235  /**
236  * Number of buckets in the initial histogram.
237  */
239 
240  /**
241  * Pointer to all initial buckets of the histogram.
242  */
243  boost::shared_array< size_t > m_mappedBuckets;
244 
245  /**
246  * Tracks the number of a buckets in the mapped histogram.
247  */
249 
250  /**
251  * Size of one bucket in the mapped histogram.
252  */
254 
255  /**
256  * The number of elements distributed in the buckets.
257  */
259 
260  /**
261  * Actually builds the histogram. This function is simply used for avoiding code duplication in all these constructors.
262  *
263  * \param valueSet the value set.
264  */
265  void buildHistogram( const WValueSetBase& valueSet );
266 };
267 
268 /**
269  * Write a histogram in string representation to the given output stream.
270  */
271 std::ostream& operator<<( std::ostream& out, const WValueSetHistogram& h );
272 
273 inline size_t WValueSetHistogram::getIndexForValue( double value ) const
274 {
275  // the position on the scala
276  double pos = ( value - m_minimum ) / static_cast< double >( m_mappedBucketSize );
277  // the index is the floor( position )
278  size_t idx = static_cast< size_t >( std::floor( pos ) );
279 
280  // is the index larger than the size?
281  bool inU = ( idx < m_nMappedBuckets );
282  // is the index smaller than the size?
283  bool inL = ( pos >= 0.0 );
284 
285  // the trick done here is to clamp value into [m_minimum,m_maximum] without using if statements. The C++ Standard says that booleans are
286  // always 1 if true.
287  // NOTE: this is integral arithmetic
288  return ( inL && inU ) * idx + ( !inU && inL ) * ( m_nMappedBuckets - 1 );
289 }
290 
291 #endif // WVALUESETHISTOGRAM_H
292 
Used to find the occurrence frequencies of values in a value set.
virtual size_t getTotalElementCount() const
This returns the number of value set entries added to the histogram.
size_t m_nInitialBuckets
Number of buckets in the initial histogram.
size_t getNInitialBuckets() const
Return the number of initial buckets.
~WValueSetHistogram()
Destructor.
virtual std::pair< double, double > getIntervalForIndex(size_t index) const
Returns the actual interval associated with the given index.
double m_mappedBucketSize
Size of one bucket in the mapped histogram.
double m_minimum
The smallest value.
Definition: WHistogram.h:123
Test WValueSetHistogram.
virtual size_t size() const
Returns the number of buckets in the histogram with the actual mapping.
WValueSetHistogram & operator=(const WValueSetHistogram &other)
Copy assignment.
boost::shared_array< size_t > m_initialBuckets
Pointer to all initial buckets of the histogram.
Container which associate values with (uniform width) bins (aka intervals or buckets).
Definition: WHistogram.h:36
size_t m_nMappedBuckets
Tracks the number of a buckets in the mapped histogram.
virtual void insert(double value)
increment the value by one, contains the logic to find the element place in the array.
double m_initialBucketSize
Size of one bucket in the initial histogram.
double getInitialBucketSize() const
Return the size of one initial bucket.
virtual size_t operator[](size_t index) const
Get the count of the bucket.
boost::shared_array< size_t > m_mappedBuckets
Pointer to all initial buckets of the histogram.
WValueSetHistogram(boost::shared_ptr< WValueSetBase > valueSet, size_t buckets=1000)
Constructor.
void buildHistogram(const WValueSetBase &valueSet)
Actually builds the histogram.
virtual double getBucketSize(size_t index=0) const
Return the size of one bucket.
size_t m_nbTotalElements
The number of elements distributed in the buckets.
Abstract base class to all WValueSets.
Definition: WValueSetBase.h:63
virtual size_t at(size_t index) const
Get the count of the bucket.
virtual size_t getIndexForValue(double value) const
Returns the right index to the bucket containing the given value.
boost::shared_array< size_t > getInitialBuckets() const
Return the initial buckets.
virtual size_t accumulate(size_t startIndex, size_t endIndex) const
Sums up the buckets in the specified interval.