Last active: February 22, 2018, 16:03
Revisions
- cmdelatorre revised this gist on Sep 2, 2014: 1 changed file with 0 additions and 164 deletions.
- cmdelatorre created this gist on Aug 13, 2014.
"""
http://en.wikipedia.org/wiki/ID3_algorithm
http://www.cise.ufl.edu/~ddd/cap6635/Fall-97/Short-papers/2.htm
"""
from collections import namedtuple, Counter, defaultdict
from math import log2

# Class returned for samples the tree cannot classify (unseen feature value).
UNCLASSIFIED_VALUE = -99


class ID3Classifier(object):

    def __init__(self):
        self.root_node = None

    def fit(self, samples, target):
        """Create the decision tree."""
        training_samples = [TrainingSample(s, t)
                            for s, t in zip(samples, target)]
        # Features are addressed by their index in each sample.
        predicting_features = list(range(len(samples[0])))
        self.root_node = self.create_decision_tree(training_samples,
                                                   predicting_features)

    def predict(self, X, allow_unclassified=False):
        # When a sample reaches a node holding no branch for its feature
        # value, fall back to class 1, or to UNCLASSIFIED_VALUE if allowed.
        default_klass = 1
        if allow_unclassified:
            default_klass = UNCLASSIFIED_VALUE
        predicted_klasses = []
        for sample in X:
            klass = None
            current_node = self.root_node
            # Walk down the tree until a leaf (or an unseen value) is hit.
            while klass is None:
                if current_node.is_leaf():
                    klass = current_node.klass
                else:
                    key_value = sample[current_node.feature]
                    if key_value in current_node:
                        current_node = current_node[key_value]
                    else:
                        klass = default_klass
            predicted_klasses.append(klass)
        return predicted_klasses

    def score(self, X, target, allow_unclassified=True):
        """Return the fraction of samples in X predicted correctly."""
        predicted = self.predict(X, allow_unclassified=allow_unclassified)
        n_matches = sum(p == t for p, t in zip(predicted, target))
        return n_matches / len(X)

    def create_decision_tree(self, training_samples, predicting_features):
        """Recursively create a decision tree and return its root node."""
        if not predicting_features:
            # No more predicting features: label with the majority class.
            default_klass = self.get_most_common_class(training_samples)
            root_node = DecisionTreeLeaf(default_klass)
        else:
            klasses = [sample.klass for sample in training_samples]
            if len(set(klasses)) == 1:
                # All samples share the same class: make a leaf.
                target_klass = training_samples[0].klass
                root_node = DecisionTreeLeaf(target_klass)
            else:
                best_feature = self.select_best_feature(training_samples,
                                                        predicting_features,
                                                        klasses)
                # Create the node to return and build its sub-trees, one per
                # observed value of the chosen feature.
                root_node = DecisionTreeNode(best_feature)
                best_feature_values = {s.sample[best_feature]
                                       for s in training_samples}
                for value in best_feature_values:
                    samples = [s for s in training_samples
                               if s.sample[best_feature] == value]
                    # Recursively create a child node. Pass each child its own
                    # copy of the feature list, so that features removed deep
                    # in one branch are not lost to sibling branches.
                    child = self.create_decision_tree(
                        samples, list(predicting_features))
                    root_node[value] = child
        return root_node

    @staticmethod
    def get_most_common_class(training_samples):
        """Return the most common class in the given samples."""
        klasses = [s.klass for s in training_samples]
        counter = Counter(klasses)
        (most_common_klass, _), = counter.most_common(n=1)
        return most_common_klass

    def select_best_feature(self, samples, features, klasses):
        """Select, remove from the list, and return the feature that best
        classifies the samples.

        The best feature is the one with the highest information gain for
        the provided samples (i.e. it minimizes the entropy in klasses).
        """
        # Rank the features by information gain; ties break on feature index.
        gain_factors = [(self.information_gain(samples, feat, klasses), feat)
                        for feat in features]
        gain_factors.sort()
        best_feature = gain_factors[-1][1]
        features.remove(best_feature)
        return best_feature

    def information_gain(self, samples, feature, klasses):
        """Information gain measures the difference in entropy from before
        to after the samples are split on the given feature's values.
        In other words: how much uncertainty in the samples was reduced by
        splitting them on the given feature.
        """
        N = len(samples)
        # Partition the samples by their value of the given feature.
        samples_partition = defaultdict(list)
        for s in samples:
            samples_partition[s.sample[feature]].append(s)
        # Weighted average of the entropy of each partition.
        feature_entropy = 0.0
        for partition in samples_partition.values():
            sub_klasses = [s.klass for s in partition]
            feature_entropy += (len(partition) / N) * self.entropy(sub_klasses)
        return self.entropy(klasses) - feature_entropy

    @staticmethod
    def entropy(dataset):
        """Shannon entropy: the amount of uncertainty in the given dataset."""
        N = len(dataset)
        counter = Counter(dataset)
        return sum(-(counter[k] / N) * log2(counter[k] / N)
                   for k in counter)


TrainingSample = namedtuple('TrainingSample', ('sample', 'klass'))


class DecisionTreeNode(dict):
    """Internal node: a dict mapping each feature value to a child node."""

    def __init__(self, feature, *args, **kwargs):
        self.feature = feature
        super(DecisionTreeNode, self).__init__(*args, **kwargs)

    def is_leaf(self):
        return False

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.feature)


class DecisionTreeLeaf(dict):
    """Leaf node: holds the class to assign to samples that reach it."""

    def __init__(self, klass, *args, **kwargs):
        self.klass = klass
        super(DecisionTreeLeaf, self).__init__(*args, **kwargs)

    def is_leaf(self):
        return True

    def __repr__(self):
        return "%s(%s)" % (self.__class__.__name__, self.klass)
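To make the information-gain math concrete, here is a small sanity check of entropy and information_gain. This is just a sketch; the two-feature toy dataset below is invented for illustration:

# Toy sanity check for entropy() and information_gain().
clf = ID3Classifier()

# A perfectly mixed binary dataset carries 1 bit of uncertainty;
# a pure dataset carries none.
print(clf.entropy([0, 0, 1, 1]))  # -> 1.0
print(clf.entropy([1, 1, 1, 1]))  # -> -0.0 (i.e. zero)

# Feature 0 predicts the class exactly; feature 1 is pure noise.
samples = [
    TrainingSample(sample=(0, 0), klass=0),
    TrainingSample(sample=(0, 1), klass=0),
    TrainingSample(sample=(1, 0), klass=1),
    TrainingSample(sample=(1, 1), klass=1),
]
klasses = [s.klass for s in samples]

print(clf.information_gain(samples, 0, klasses))  # -> 1.0 (all uncertainty removed)
print(clf.information_gain(samples, 1, klasses))  # -> 0.0 (nothing learned)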
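Finally, a minimal end-to-end sketch of the classifier in use. The tiny weather-style dataset, with (outlook, windy) features, is likewise made up for illustration:

# Hypothetical end-to-end usage on a made-up (outlook, windy) dataset.
X_train = [
    ('sunny', 'no'), ('sunny', 'yes'),
    ('rainy', 'no'), ('rainy', 'yes'),
    ('overcast', 'no'), ('overcast', 'yes'),
]
y_train = [1, 1, 1, 0, 1, 1]  # 1 = play, 0 = don't play

clf = ID3Classifier()
clf.fit(X_train, y_train)

print(clf.root_node)                    # -> DecisionTreeNode(0): splits on outlook
print(clf.predict([('rainy', 'yes')]))  # -> [0]
print(clf.score(X_train, y_train))      # -> 1.0 on the training data

# Feature values never seen in training fall back to UNCLASSIFIED_VALUE
# when allow_unclassified is set:
print(clf.predict([('foggy', 'no')], allow_unclassified=True))  # -> [-99]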