SHOGUN  3.2.1
 全部  命名空间 文件 函数 变量 类型定义 枚举 枚举值 友元 宏定义 
FeatureSelection.cpp
浏览该文件的文档.
1 /*
2  * Copyright (c) The Shogun Machine Learning Toolbox
3  * Written (w) 2014 Soumyajit De
4  * All rights reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions are met:
8  *
9  * 1. Redistributions of source code must retain the above copyright notice, this
10  * list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright notice,
12  * this list of conditions and the following disclaimer in the documentation
13  * and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
16  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR
19  * ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
20  * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
21  * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
22  * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
23  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
24  * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
25  *
26  * The views and conclusions contained in the software and documentation are those
27  * of the authors and should not be interpreted as representing official policies,
28  * either expressed or implied, of the Shogun Development Team.
29  */
30 
31 #include <shogun/labels/Labels.h>
37 
38 namespace shogun
39 {
40 
// NOTE(review): the signature line (listing line 42) is missing from this
// listing; presumably this is the default constructor of
// CFeatureSelection<ST> -- TODO confirm against the header.
// The body only delegates to init().
41 template <class ST>
43 {
44  init();
45 }
46 
// NOTE(review): the signature line (listing line 48) is missing from this
// listing; presumably this is init(), which the block above calls.
//
// Registers the members m_target_dim, m_algorithm, m_policy, m_num_remove,
// m_labels and m_subset through SG_ADD, then assigns their initial values.
//
// NOTE(review): listing lines 51 and 55 (the tails of two SG_ADD calls) are
// also absent here, so those two calls appear unterminated.
// NOTE(review): the description string for "algorithm" reads
// "selectiona" -- looks like a typo for "selection"; it is a runtime string
// and is left untouched here.
47 template <class ST>
49 {
50  SG_ADD(&m_target_dim, "target_dim", "target dimension",
52  SG_ADD((machine_int_t*)&m_algorithm, "algorithm",
53  "the feature selectiona algorithm", MS_NOT_AVAILABLE);
54  SG_ADD((machine_int_t*)&m_policy, "policy", "feature removal policy",
56  SG_ADD(&m_num_remove, "num_remove", "number or percentage of features to "
57  "be removed", MS_NOT_AVAILABLE);
58  SG_ADD((CSGObject**)&m_labels, "labels",
59  "the class labels for the features", MS_NOT_AVAILABLE);
60  SG_ADD((CSGObject**)&m_subset, "subset",
61  "indices of selected features", MS_NOT_AVAILABLE);
62 
// initial values: no target dimension, backward elimination, remove the
// single feature with the largest measure, no labels, fresh subset stack
63  m_target_dim=0;
64  m_algorithm=BACKWARD_ELIMINATION;
65  m_policy=N_LARGEST;
66  m_num_remove=1;
67  m_labels=NULL;
68  m_subset=new CSubsetStack();
69 }
70 
// NOTE(review): the signature line (listing line 72) is missing from this
// listing; presumably this is the destructor -- TODO confirm.
// Drops this object's references to m_labels and m_subset via SG_UNREF.
71 template <class ST>
73 {
74  SG_UNREF(m_labels);
75  SG_UNREF(m_subset);
76 }
77 
78 
// NOTE(review): the signature line (listing line 80) is missing from this
// listing; presumably this is a cleanup/reset method -- TODO confirm the name.
// Removes every subset currently stored in m_subset.
79 template <class ST>
81 {
82  m_subset->remove_all_subsets();
83 }
84 
// Backward elimination loop.
//
// NOTE(review): the signature line (listing line 86) is missing from this
// listing; the index at the end of the page gives it as
// `virtual CFeatures * apply_backward_elimination(CFeatures *features)`.
//
// While get_num_features(features) is greater than m_target_dim, each
// iteration: calls adapt_params(), computes one measure per current feature
// with compute_measures(), ranks them with argsort(), and passes the ranking
// to remove_feats(), whose return value replaces `features`.
// Returns the `features` pointer as last assigned.
85 template <class ST>
87 {
88  SG_DEBUG("Entering!\n");
89 
90  // precompute whenever appropriate for performing the rest of the tasks
91  precompute();
92 
93  // NULL check for features is handled in get_num_features
94  index_t num_features=get_num_features(features);
95  SG_DEBUG("Initial number of features %d!\n", num_features);
96 
97  // the main loop
98  while (num_features>m_target_dim)
99  {
100  // tune the measurement parameters whenever necessary based on current
101  // features
102  adapt_params(features);
103 
104  // compute the measures for each of the current dimensions
105  SGVector<float64_t> measures(num_features);
106  for (index_t i=0; i<num_features; ++i)
107  measures[i]=compute_measures(features, i);
108 
109  if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
110  measures.display_vector("measures");
111 
112  // rank the measures
113  SGVector<index_t> argsorted=measures.argsort();
114 
115  if (io->get_loglevel()==MSG_DEBUG || io->get_loglevel()==MSG_GCDEBUG)
116  argsorted.display_vector("argsorted");
117 
118  // make sure that we don't end up with fewer features than the target
// dimension. to_remove is only used for the comparison against can_remove
// below; the actual removal is done by remove_feats()
119  index_t to_remove;
120  if (m_policy==N_SMALLEST || m_policy==N_LARGEST)
121  to_remove=m_num_remove;
122  else
// NOTE(review): the product is a floating point value that is converted to
// index_t on assignment, so a small percentage can give 0 here -- TODO
// confirm that remove_feats() still makes progress in that case
123  to_remove=num_features*m_num_remove*0.01;
124 
125  index_t can_remove=num_features-m_target_dim;
126 
127  // if policy is to remove N feats corresponding to smallest/largest
128  // measures, we just replace N with can_remove. if policy is to remove
129  // N% feats, then we change the policy temporarily and remove a fixed
130  // can_remove number of feats instead
131  index_t orig_remove=m_num_remove;
132  EFeatureRemovalPolicy orig_policy=m_policy;
133 
134  if (to_remove>can_remove)
135  {
136  m_num_remove=can_remove;
137  SG_DEBUG("Can only remove %d features in this iteration!\n",
138  can_remove);
139 
140  if (m_policy==PERCENTILE_SMALLEST)
141  m_policy=N_SMALLEST;
142  else if (m_policy==PERCENTILE_LARGEST)
143  m_policy=N_LARGEST;
144  }
145 
146  // remove appropriate number of features based on the measures and the
147  // removal policy. this internally updates the subset for selected
148  // features as well
149  features=remove_feats(features, argsorted);
150 
151  // restore original removal policy and numbers if necessary for the
152  // sake of consistency
153  if (to_remove>can_remove)
154  {
155  m_policy=orig_policy;
156  m_num_remove=orig_remove;
157  }
158 
159  // update the number of features
160  num_features=get_num_features(features);
161  SG_DEBUG("Current number of features %d!\n", num_features);
162  }
163 
164  // sanity check: the stored subset must have exactly m_target_dim entries
165  ASSERT(m_subset->get_size()==m_target_dim);
166 
167  SG_DEBUG("Leaving!\n");
168  return features;
169 }
170 
// Entry point that validates the input and dispatches on m_algorithm.
//
// NOTE(review): the signature line (listing line 172) is missing from this
// listing; the index at the end of the page gives it as
// `virtual CFeatures * apply(CFeatures *features)`.
//
// Clears previously stored subsets, checks via REQUIRE that `features` is
// non-NULL, has at least one vector, that m_target_dim is positive and that
// the feature count is greater than m_target_dim, then clones `features` and
// hands the clone (not the input object) to the selected algorithm.
//
// NOTE(review): the last REQUIRE message says "greater that" -- looks like a
// typo for "greater than"; it is a runtime string and is left untouched here.
171 template <class ST>
173 {
174  SG_DEBUG("Entering!\n");
175 
176  // remove previously computed feature subsets
177  m_subset->remove_all_subsets();
178 
179  // sanity checks
180  REQUIRE(features, "Features cannot be NULL!\n");
181  REQUIRE(features->get_num_vectors()>0,
182  "Number of feature vectors has to be positive!\n");
183  REQUIRE(m_target_dim>0, "Target dimension (%d) has to be positive! Set "
184  "a higher number via set_target_dim().\n", m_target_dim);
185 
186  index_t num_features=get_num_features(features);
187  REQUIRE(num_features>0, "Invalid number of features (%d)! Most likely "
188  "feature selection cannot be performed for %s!\n",
189  num_features, features->get_name());
190  REQUIRE(num_features>m_target_dim,
191  "Number of original features (dimensions of the feature vectors) "
192  "(%d) has to be greater that the target dimension (%d)!\n",
193  num_features, m_target_dim);
194 
195  // this method makes a deep copy of the feature object and performs
196  // feature selection on it. This is already SG_REF'ed because of the
197  // implementation of clone()
198  CFeatures* feats_copy=(CFeatures*)features->clone();
199 
200  switch (m_algorithm)
201  {
// NOTE(review): listing line 202 is missing here; presumably the case label
// for the backward elimination algorithm
203  return apply_backward_elimination(feats_copy);
204  default:
// NOTE(review): this path returns the caller's `features` and does not
// release feats_copy; that only matters if SG_ERROR returns -- TODO confirm
205  SG_ERROR("Specified algorithm not yet supported!\n");
206  return features;
207  }
208 
// NOTE(review): both visible switch branches return, so this statement looks
// unreachable
209  SG_DEBUG("Leaving!\n");
210 }
211 
// NOTE(review): the signature line (listing line 213) is missing from this
// listing; presumably one of the hooks called from the backward elimination
// loop (precompute() or adapt_params()) -- TODO confirm which.
// The body is empty.
212 template <class ST>
214 {
215 }
216 
// NOTE(review): the signature line (listing line 218) is missing from this
// listing; presumably one of the hooks called from the backward elimination
// loop (precompute() or adapt_params()) -- TODO confirm which.
// The body is empty.
217 template <class ST>
219 {
220 }
221 
// Returns the indices currently held in m_subset.
//
// NOTE(review): the signature line (listing line 223) is missing from this
// listing; the index at the end of the page gives it as
// `SGVector< index_t > get_selected_feats()`.
//
// When m_subset has subsets, the result has m_subset->get_size() entries,
// filled with subset_idx_conversion(i) for each position i and then passed
// through qsort(). Otherwise a default-constructed SGVector is returned.
222 template <class ST>
224 {
225  ASSERT(m_subset);
226 
227  SGVector<index_t> inds;
228  if (m_subset->has_subsets())
229  {
230  inds=SGVector<index_t>(m_subset->get_size());
231  for (index_t i=0; i<inds.vlen; ++i)
232  inds[i]=m_subset->subset_idx_conversion(i);
233  inds.qsort();
234  }
235 
236  return inds;
237 }
238 
// Returns the number of features (dimensions) of a feature object.
//
// NOTE(review): the signature line (listing line 240) is missing from this
// listing; the index at the end of the page gives it as
// `index_t get_num_features(CFeatures *features) const`.
//
// `features` must be non-NULL (checked with REQUIRE). For feature class
// C_DENSE / C_SPARSE the object is dynamic_cast to CDenseFeatures<ST> /
// CSparseFeatures<ST>, the cast result is checked with REQUIRE, and its
// get_num_features() is returned. Any other feature class is reported through
// SG_ERROR, after which control reaches the final `return 0`.
239 template <class ST>
241 {
242  REQUIRE(features, "Features not initialized!\n");
243 
244  EFeatureClass f_class=features->get_feature_class();
245 
246  switch (f_class)
247  {
248  case C_DENSE:
249  {
250  CDenseFeatures<ST>* d_feats=dynamic_cast<CDenseFeatures<ST>*>(features);
251  REQUIRE(d_feats, "Type mismatch for dense features!\n");
252  return d_feats->get_num_features();
253  }
254  case C_SPARSE:
255  {
256  CSparseFeatures<ST>* s_feats=dynamic_cast<CSparseFeatures<ST>*>(features);
257  REQUIRE(s_feats, "Type mismatch for sparse features!\n");
258  return s_feats->get_num_features();
259  }
260  default:
261  SG_ERROR("Number of features not available for %s!\n",
262  features->get_name());
263  break;
264  }
265 
266  return 0;
267 }
268 
// NOTE(review): the signature line (listing line 270) is missing from this
// listing; presumably `void set_target_dim(index_t target_dim)`.
// Stores `target_dim` in m_target_dim; no validation happens here.
269 template <class ST>
271 {
272  m_target_dim=target_dim;
273 }
274 
// NOTE(review): the signature line (listing line 276) is missing from this
// listing; presumably the getter paired with set_target_dim().
// Returns m_target_dim.
275 template <class ST>
277 {
278  return m_target_dim;
279 }
280 
// NOTE(review): the signature line (listing line 282) is missing from this
// listing; presumably `EFeatureSelectionAlgorithm get_algorithm() const`.
// Returns m_algorithm.
281 template <class ST>
283 {
284  return m_algorithm;
285 }
286 
// NOTE(review): the signature line (listing line 288) is missing from this
// listing; presumably `EFeatureRemovalPolicy get_policy() const`.
// Returns m_policy.
287 template <class ST>
289 {
290  return m_policy;
291 }
292 
// NOTE(review): the signature line (listing line 294) is missing from this
// listing; presumably `void set_num_remove(index_t num_remove)`.
// Stores `num_remove` in m_num_remove; no validation happens here.
293 template <class ST>
295 {
296  m_num_remove=num_remove;
297 }
298 
// NOTE(review): the signature line (listing line 300) is missing from this
// listing; presumably the getter paired with set_num_remove().
// Returns m_num_remove.
299 template <class ST>
301 {
302  return m_num_remove;
303 }
304 
// NOTE(review): the signature line (listing line 306) is missing from this
// listing; presumably `virtual void set_labels(CLabels *labels)`.
// Replaces m_labels with `labels`, taking a reference on the new object and
// releasing the one held on the previous object.
305 template <class ST>
307 {
// reference the incoming object before releasing the old one -- presumably so
// that passing the currently held labels again does not drop them
308  SG_REF(labels);
309  SG_UNREF(m_labels);
310  m_labels=labels;
311 }
312 
// NOTE(review): the signature line (listing line 314) is missing from this
// listing; presumably `CLabels * get_labels() const`.
// Returns m_labels after SG_REF'ing it; presumably the caller is expected to
// SG_UNREF the result -- TODO confirm against the header documentation.
313 template <class ST>
315 {
316  SG_REF(m_labels);
317  return m_labels;
318 }
319 
// NOTE(review): the signature line (listing line 321) is missing from this
// listing; presumably `virtual EFeatureClass get_feature_class()`.
// Always returns C_ANY.
320 template <class ST>
322 {
323  return C_ANY;
324 }
325 
// NOTE(review): the signature line (listing line 327) is missing from this
// listing; presumably `virtual EPreprocessorType get_type() const`.
// Always returns P_UNKNOWN.
326 template <class ST>
328 {
329  return P_UNKNOWN;
330 }
331 
// Thirteen explicit specializations (`template<>`), each returning a single
// F_* constant.
//
// NOTE(review): every signature line is missing from this listing, so the
// specialized element type of each body cannot be read here; presumably these
// are the per-type specializations of get_feature_type() -- TODO confirm each
// type/constant pairing against the real source.
// NOTE(review): F_CHAR is returned by two consecutive specializations --
// confirm that this is intended.
332 template<>
334 {
335  return F_LONGREAL;
336 }
337 
338 template<>
340 {
341  return F_DREAL;
342 }
343 
344 template<>
346 {
347  return F_SHORTREAL;
348 }
349 
350 template<>
352 {
353  return F_SHORT;
354 }
355 
356 template<>
358 {
359  return F_WORD;
360 }
361 
362 template<>
364 {
365  return F_CHAR;
366 }
367 
368 template<>
370 {
371  return F_CHAR;
372 }
373 
374 template<>
376 {
377  return F_BYTE;
378 }
379 
380 template<>
382 {
383  return F_INT;
384 }
385 
386 template<>
388 {
389  return F_UINT;
390 }
391 
392 template<>
394 {
395  return F_LONG;
396 }
397 
398 template<>
400 {
401  return F_ULONG;
402 }
403 
404 template<>
406 {
407  return F_BOOL;
408 }
409 
// Explicit instantiations of CFeatureSelection for the thirteen element types
// listed below (bool, char, 8/16/32/64-bit signed and unsigned integers, and
// three floating point types).
410 template class CFeatureSelection<bool>;
411 template class CFeatureSelection<char>;
412 template class CFeatureSelection<int8_t>;
413 template class CFeatureSelection<uint8_t>;
414 template class CFeatureSelection<int16_t>;
415 template class CFeatureSelection<uint16_t>;
416 template class CFeatureSelection<int32_t>;
417 template class CFeatureSelection<uint32_t>;
418 template class CFeatureSelection<int64_t>;
419 template class CFeatureSelection<uint64_t>;
420 template class CFeatureSelection<float32_t>;
421 template class CFeatureSelection<float64_t>;
422 template class CFeatureSelection<floatmax_t>;
423 
424 }
virtual const char * get_name() const =0
virtual void adapt_params(CFeatures *features)
EPreprocessorType
Definition: Preprocessor.h:32
The class DenseFeatures implements dense feature matrices.
Definition: LDA.h:41
int32_t get_num_features() const
SGVector< index_t > get_selected_feats()
int32_t index_t
Definition: common.h:62
The class Labels models labels, i.e. class assignments of objects.
Definition: Labels.h:43
virtual CSGObject * clone()
Definition: SGObject.cpp:1360
Template class SparseFeatures implements sparse matrices.
virtual int32_t get_num_vectors() const =0
#define SG_ERROR(...)
Definition: SGIO.h:129
#define REQUIRE(x,...)
Definition: SGIO.h:206
virtual EPreprocessorType get_type() const
#define SG_REF(x)
Definition: SGObject.h:51
EFeatureClass
shogun feature class
Definition: FeatureTypes.h:38
Class to add subset support to another class. A CSubsetStack instance should be added and wrappe...
Definition: SubsetStack.h:37
Template class CFeatureSelection, base class for all feature selection preprocessors which select a s...
void display_vector(const char *name="vector", const char *prefix="") const
Definition: SGVector.cpp:426
void set_num_remove(index_t num_remove)
EFeatureSelectionAlgorithm get_algorithm() const
SGVector< index_t > argsort()
Definition: SGVector.cpp:215
virtual void set_labels(CLabels *labels)
index_t vlen
Definition: SGVector.h:637
virtual CFeatures * apply_backward_elimination(CFeatures *features)
#define ASSERT(x)
Definition: SGIO.h:201
Class SGObject is the base class of all shogun objects.
Definition: SGObject.h:112
int32_t get_num_features() const
EFeatureRemovalPolicy get_policy() const
virtual EFeatureClass get_feature_class()
virtual EFeatureClass get_feature_class() const =0
EFeatureType
shogun feature type
Definition: FeatureTypes.h:19
#define SG_UNREF(x)
Definition: SGObject.h:52
#define SG_DEBUG(...)
Definition: SGIO.h:107
virtual EFeatureType get_feature_type()
All classes and functions are contained in the shogun namespace
Definition: class_list.h:18
index_t get_num_features(CFeatures *features) const
int machine_int_t
Definition: common.h:59
The class Features is the base class of all feature objects.
Definition: Features.h:68
EFeatureSelectionAlgorithm
Class Preprocessor defines a preprocessor interface.
Definition: Preprocessor.h:75
void set_target_dim(index_t target_dim)
#define SG_ADD(...)
Definition: SGObject.h:81
virtual CFeatures * apply(CFeatures *features)
CLabels * get_labels() const

SHOGUN 机器学习工具包 - 项目文档