Created
June 30, 2018 13:51
-
-
Save meer-online/6bb919c0abb90b0ad339dee6566492d2 to your computer and use it in GitHub Desktop.
Revisions
-
meer-online created this gist
Jun 30, 2018. There are no files selected for viewing
# This file contains hidden or bidirectional Unicode text that may be
# interpreted or compiled differently than what appears below. To review,
# open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Original file line number / Diff line number / Diff line change
# @@ -0,0 +1,152 @@

# Required Python Packages
import pdb

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

# File Paths
INPUT_PATH = "breast-cancer-wisconsin.data"
OUTPUT_PATH = "breast-cancer-wisconsin.csv"

# Headers
HEADERS = ["CodeNumber", "ClumpThickness", "UniformityCellSize",
           "UniformityCellShape", "MarginalAdhesion",
           "SingleEpithelialCellSize", "BareNuclei", "BlandChromatin",
           "NormalNucleoli", "Mitoses", "CancerType"]


def read_data(path, header="infer"):
    """
    Read a delimited text file into a pandas dataframe
    :param path: location of the file, handed to ``pd.read_csv``
    :param header: forwarded to ``pd.read_csv``; the default ``"infer"``
        keeps the historical behaviour (first row becomes the column names),
        pass ``None`` when the file has no header row
    :return: the loaded pandas dataframe
    """
    return pd.read_csv(path, header=header)


def get_headers(dataset):
    """
    Column names of the dataset
    :param dataset: pandas dataframe
    :return: the values of ``dataset.columns``
    """
    return dataset.columns.values


def add_headers(dataset, headers):
    """
    Replace the column names of the dataset (modifies ``dataset`` in place)
    :param dataset: pandas dataframe
    :param headers: one name per column of ``dataset``
    :return: the same dataframe object, now labelled with ``headers``
    """
    dataset.columns = headers
    return dataset


def data_file_to_csv():
    """
    Convert the raw file at INPUT_PATH into a csv with named columns,
    written to OUTPUT_PATH
    :return: None
    """
    # Every column name is assigned below, so the input is read as
    # header-less. With the default header inference the first record
    # would be consumed as column names and dropped from the output.
    dataset = read_data(INPUT_PATH, header=None)
    # Pass a copy so the module-level constant is never shared with pandas
    dataset = add_headers(dataset, list(HEADERS))
    # Save the loaded dataset into csv format
    dataset.to_csv(OUTPUT_PATH, index=False)
    print("File saved ...!")
def split_dataset(dataset, train_percentage, feature_headers, target_header,
                  random_state=None):
    """
    Split the dataset with train_percentage
    :param dataset: pandas dataframe holding features and target
    :param train_percentage: passed to ``train_test_split`` as ``train_size``
    :param feature_headers: column names used as features
    :param target_header: column name used as target
    :param random_state: optional seed forwarded to ``train_test_split``;
        the default ``None`` keeps the split unseeded as before
    :return: train_x, test_x, train_y, test_y
    """
    # Split dataset into train and test dataset
    train_x, test_x, train_y, test_y = train_test_split(
        dataset[feature_headers], dataset[target_header],
        train_size=train_percentage, random_state=random_state)
    return train_x, test_x, train_y, test_y


def handel_missing_values(dataset, missing_values_header, missing_label):
    """
    Filter missing values from the dataset
    :param dataset: pandas dataframe
    :param missing_values_header: column to inspect
    :param missing_label: value that marks a missing entry in that column
    :return: the rows whose ``missing_values_header`` value differs from
        ``missing_label``; the input dataframe is left untouched
    """
    return dataset[dataset[missing_values_header] != missing_label]


def random_forest_classifier(features, target, random_state=None):
    """
    To train the random forest classifier with features and target data
    :param features: training features
    :param target: training labels
    :param random_state: optional seed forwarded to
        ``RandomForestClassifier``; the default ``None`` keeps it unseeded
    :return: trained random forest classifier
    """
    clf = RandomForestClassifier(random_state=random_state)
    clf.fit(features, target)
    return clf


def dataset_statistics(dataset):
    """
    Basic statistics of the dataset
    :param dataset: Pandas dataframe
    :return: None, print the basic statistics of the dataset
    """
    print(dataset.describe())


def main():
    """
    Load the csv at OUTPUT_PATH, drop rows with a missing value, train a
    random forest on a 70% split and print shapes, sample predictions,
    accuracies and the confusion matrix
    :return: None
    """
    # Load the csv file into pandas dataframe
    dataset = pd.read_csv(OUTPUT_PATH)
    # Get basic statistics of the loaded dataset
    dataset_statistics(dataset)

    # Filter missing values: HEADERS[6] is "BareNuclei", '?' marks a gap
    dataset = handel_missing_values(dataset, HEADERS[6], '?')
    # Features are every column between the first (CodeNumber) and the
    # last (CancerType); the last column is the target
    train_x, test_x, train_y, test_y = split_dataset(
        dataset, 0.7, HEADERS[1:-1], HEADERS[-1])

    # Train and Test dataset size details.
    # Single-argument print(...) with str.format emits the same text on
    # Python 2 and Python 3 (the doubled space mirrors the separator the
    # original comma-style print statement inserted).
    print("Train_x Shape ::  {}".format(train_x.shape))
    print("Train_y Shape ::  {}".format(train_y.shape))
    print("Test_x Shape ::  {}".format(test_x.shape))
    print("Test_y Shape ::  {}".format(test_y.shape))

    # Create random forest classifier instance
    trained_model = random_forest_classifier(train_x, train_y)
    print("Trained model ::  {}".format(trained_model))
    predictions = trained_model.predict(test_x)

    # Preview at most five predictions; slicing once avoids rebuilding the
    # list per row and cannot index past the end of a small test split
    for actual, predicted in zip(list(test_y)[:5], predictions[:5]):
        print("Actual outcome :: {} and Predicted outcome :: {}".format(
            actual, predicted))

    print("Train Accuracy ::  {}".format(
        accuracy_score(train_y, trained_model.predict(train_x))))
    print("Test Accuracy ::  {}".format(accuracy_score(test_y, predictions)))
    print(" Confusion matrix  {}".format(confusion_matrix(test_y, predictions)))


if __name__ == "__main__":
    main()

# http://dataaspirant.com/2017/06/26/random-forest-classifier-python-scikit-learn/