
@Adithya-Rama
Last active January 9, 2024 19:41

Revisions

  1. Adithya-Rama revised this gist Jan 9, 2024. 1 changed file with 2 additions and 1 deletion.
    3 changes: 2 additions & 1 deletion KMeansAlgorithm.py
    @@ -65,4 +65,5 @@
    plt.scatter(X_Scaled['petal length (cm)'], X_Scaled['petal width (cm)'], c=colormap[y_cluster_gmm], s=40)
    plt.title('GMM Classification')

    # OUTPUT :- GRAPHS
    ![2024-01-10](https://gist.github.com/assets/69115355/be2ca29c-68ad-48a5-a206-f49c1d9f6e3a)
  2. Adithya-Rama revised this gist Jan 9, 2024. 5 changed files with 273 additions and 0 deletions.
    26 changes: 26 additions & 0 deletions CSVFiles.txt
    @@ -0,0 +1,26 @@
    EnjoySport.csv

    sky air_temp humidity wind water forecast enjoy_sport
    0 sunny warm normal strong warm same yes
    1 sunny warm high strong warm same yes
    2 rainy cold high strong warm change no
    3 sunny warm high strong cool change yes


    PlayTennis.csv

    Outlook Temperature Humidity Wind Play Tennis
    0 Sunny Hot High Weak No
    1 Sunny Hot High Strong No
    2 Overcast Hot High Weak Yes
    3 Rain Mild High Weak Yes
    4 Rain Cool Normal Weak Yes
    5 Rain Cool Normal Strong No
    6 Overcast Cool Normal Strong Yes
    7 Sunny Mild High Weak No
    8 Sunny Cool Normal Weak Yes
    9 Rain Mild Normal Weak Yes
    10 Sunny Mild Normal Strong Yes
    11 Overcast Mild High Strong Yes
    12 Overcast Hot Normal Weak Yes
    13 Rain Mild High Strong No
    68 changes: 68 additions & 0 deletions KMeansAlgorithm.py
    @@ -0,0 +1,68 @@
    import matplotlib.pyplot as plt
    import numpy as np
    import pandas as pd
    from sklearn import preprocessing
    from sklearn.cluster import KMeans
    from sklearn.datasets import load_iris
    from sklearn.mixture import GaussianMixture

    iris = load_iris()
    df = pd.DataFrame(iris['data'], columns=iris['feature_names'])
    df['target'] = iris['target']
    print(df.head())

    # sepal length (cm) sepal width (cm) petal length (cm) petal width (cm) \
    # 0 5.1 3.5 1.4 0.2
    # 1 4.9 3.0 1.4 0.2
    # 2 4.7 3.2 1.3 0.2
    # 3 4.6 3.1 1.5 0.2
    # 4 5.0 3.6 1.4 0.2

    # target
    # 0 0
    # 1 0
    # 2 0
    # 3 0
    # 4 0

    X = df.iloc[:, :-1]
    Y = df['target']

    scaler = preprocessing.StandardScaler()
    scaler.fit(X)
    X_Scaled_Array = scaler.transform(X)
    X_Scaled = pd.DataFrame(X_Scaled_Array, columns = X.columns)
    print(X_Scaled.head())

    # sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
    # 0 -0.900681 1.019004 -1.340227 -1.315444
    # 1 -1.143017 -0.131979 -1.340227 -1.315444
    # 2 -1.385353 0.328414 -1.397064 -1.315444
    # 3 -1.506521 0.098217 -1.283389 -1.315444
    # 4 -1.021849 1.249201 -1.340227 -1.315444

    plt.figure(figsize=(14, 7))
    colormap = np.array(['red', 'green', 'blue'])

    #REAL PLOT
    plt.subplot(1, 3, 1)
    plt.scatter(X_Scaled['petal length (cm)'], X_Scaled['petal width (cm)'], c=colormap[Y], s=40)
    plt.title('Real')

    #K-PLOT
    plt.subplot(1, 3, 2)
    model = KMeans(n_clusters=3, random_state=0)
    pred_y = model.fit_predict(X_Scaled)
    # KMeans numbers its clusters arbitrarily; remap the ids so the colours line up with the true classes
    pred_y = np.choose(pred_y, [1, 0, 2]).astype(np.int64)
    plt.scatter(X_Scaled['petal length (cm)'], X_Scaled['petal width (cm)'],c=colormap[pred_y], s=40)
    plt.title('KMeans')

    #GMM PLOT
    gmm = GaussianMixture(n_components=3, max_iter=200)
    y_cluster_gmm = gmm.fit_predict(X_Scaled)
    # Remap the GMM cluster ids to the true classes; this [2, 0, 1] mapping was found
    # by inspection and can change between runs since no random_state is set
    y_cluster_gmm = np.choose(y_cluster_gmm, [2, 0, 1]).astype(np.int64)
    plt.subplot(1, 3, 3)
    plt.scatter(X_Scaled['petal length (cm)'], X_Scaled['petal width (cm)'], c=colormap[y_cluster_gmm], s=40)
    plt.title('GMM Classification')
    plt.show()

    # OUTPUT :- GRAPHS
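    # A hedged extra, not in the original gist: the adjusted Rand index from
    # sklearn.metrics quantifies cluster/label agreement and is invariant to the
    # arbitrary cluster numbering that np.choose patched up above.
    from sklearn.metrics import adjusted_rand_score
    print("KMeans ARI :", adjusted_rand_score(Y, pred_y))
    print("GMM ARI    :", adjusted_rand_score(Y, y_cluster_gmm))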
    36 changes: 36 additions & 0 deletions KNNAlgorithm.py
    @@ -0,0 +1,36 @@
    # KNN Algorithm
    from sklearn.model_selection import train_test_split
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.datasets import load_iris

    iris = load_iris()
    print("Iris Dataset Loaded...")
    x_train, x_test, y_train, y_test = train_test_split(iris.data, iris.target, test_size=0.1)

    classifier = KNeighborsClassifier(n_neighbors=2)
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)

    print("Results of Classification using KNN with K=1 ")
    for r in range(0, len(x_test)) :
    print(f"Sample: {str(x_test[r])} Actual-Label: {str(y_test[r])} Predicted-Label: {str(y_pred[r])}")
    print(f"Classification Accuracy : {classifier.score(x_test, y_test)}")

    # OUTPUT :-
    # Results of Classification using KNN with K=2
    # Sample: [6.3 2.5 4.9 1.5] Actual-Label: 1 Predicted-Label: 2
    # Sample: [4.7 3.2 1.3 0.2] Actual-Label: 0 Predicted-Label: 0
    # Sample: [4.9 2.4 3.3 1. ] Actual-Label: 1 Predicted-Label: 1
    # Sample: [5.1 3.8 1.6 0.2] Actual-Label: 0 Predicted-Label: 0
    # Sample: [6.7 3. 5.2 2.3] Actual-Label: 2 Predicted-Label: 2
    # Sample: [5.5 3.5 1.3 0.2] Actual-Label: 0 Predicted-Label: 0
    # Sample: [6.4 2.7 5.3 1.9] Actual-Label: 2 Predicted-Label: 2
    # Sample: [5.5 2.5 4. 1.3] Actual-Label: 1 Predicted-Label: 1
    # Sample: [5.4 3. 4.5 1.5] Actual-Label: 1 Predicted-Label: 1
    # Sample: [6. 2.2 4. 1. ] Actual-Label: 1 Predicted-Label: 1
    # Sample: [6.8 3.2 5.9 2.3] Actual-Label: 2 Predicted-Label: 2
    # Sample: [7.2 3.6 6.1 2.5] Actual-Label: 2 Predicted-Label: 2
    # Sample: [4.9 2.5 4.5 1.7] Actual-Label: 2 Predicted-Label: 1
    # Sample: [5.7 2.6 3.5 1. ] Actual-Label: 1 Predicted-Label: 1
    # Sample: [6.1 2.8 4.7 1.2] Actual-Label: 1 Predicted-Label: 1
    # Classification Accuracy : 0.8666666666666667
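    # A hedged extension, not in the original gist: sweep the neighbour count k
    # and report test accuracy for this particular split (results vary run to run
    # because train_test_split is not seeded).
    for k in [1, 3, 5, 7]:
        knn = KNeighborsClassifier(n_neighbors=k).fit(x_train, y_train)
        print(f"k={k}  accuracy={knn.score(x_test, y_test):.3f}")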
    25 changes: 25 additions & 0 deletions LocalRegression.py
    @@ -0,0 +1,25 @@
    import numpy as np
    import matplotlib.pyplot as plt

    X = np.linspace(-3, 3, num=1000)
    domain = X
    Y = np.log(np.abs(X**2) + 0.5)

    def local_regression(X0, X, Y, tau):
        X0 = [1, X0]  # add the bias term so the local model is y = b0 + b1*x
        X = np.asarray([[1, i] for i in X])
        # Gaussian kernel weights centred on X0, folded into X^T
        XW = X.T * np.exp(np.sum((X - X0) ** 2, axis=1) / (-2 * tau ** 2))
        # Normal equations via the pseudoinverse: beta = pinv(X^T W X) @ X^T W Y;
        # the trailing @ X0 evaluates the fitted local line at X0, so this returns the prediction
        return np.linalg.pinv(XW @ X) @ XW @ Y @ X0


    def draw(tau):
        prediction = [local_regression(x0, X, Y, tau) for x0 in domain]
        plt.plot(X, Y, 'o', color='black')
        plt.plot(domain, prediction, color='red')
        plt.show()

    draw(0.1)

    # OUTPUT :- Graph
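    # A hedged extension, not in the original gist: tau is the kernel bandwidth.
    # Small values weight only very near neighbours (a wigglier fit); large values
    # approach ordinary linear regression. Compare a few bandwidths:
    for tau in [0.05, 0.1, 0.5, 1.0]:
        draw(tau)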
    118 changes: 118 additions & 0 deletions NaiveByesAlgorithm.py
    @@ -0,0 +1,118 @@
    import pandas as pd

    df = pd.read_csv("./Data/PlayTennis.csv")

    target = str(list(df)[-1])
    print(target)
    target_list = set(df[target])
    print(target_list)
    # Play Tennis
    # {'Yes', 'No'}

    Attr = {}
    for a in list(df)[:-1]:
        Attr[a] = set(df[a])
    print(Attr)
    # {'Outlook': {'Overcast', 'Rain', 'Sunny'}, 'Temperature': {'Cool', 'Mild', 'Hot'}, 'Humidity': {'High', 'Normal'}, 'Wind': {'Weak', 'Strong'}}

    def probAttr(data, attr, val):
        Total = data.shape[0]
        cnt = len(data[data[attr] == val])
        return cnt, cnt / Total

    probAttr(df, target, "Yes")
    # (9, 0.6428571428571429)

    def train(data, Attr, targetVals, target):
        targetProbs = {}    # P(A)
        countTarget = {}

        for targetVal in targetVals:
            countTarget[targetVal], targetProbs[targetVal] = probAttr(data, target, targetVal)

        AttrConcept = {}        # P(X|A)
        probability_list = {}   # P(X)

        for att in Attr:
            probability_list[att] = {}
            AttrConcept[att] = {}

            for val in Attr[att]:
                AttrConcept[att][val] = {}
                cnt, probability_list[att][val] = probAttr(data, att, val)

                for targetVal in targetVals:
                    dataTemp = data[data[att] == val]
                    AttrConcept[att][val][targetVal] = len(dataTemp[dataTemp[target] == targetVal]) / countTarget[targetVal]

        print("P(A) : ", targetProbs, "\n")
        print("P(X|A) : ", AttrConcept, "\n")
        print("P(X) : ", probability_list, "\n")
        return targetProbs, AttrConcept, probability_list

    def test(examples, Attr, target_list, targetProbs, AttrConcept, probability_list):
        misclassification_count = 0
        Total = len(examples)

        for ex in examples:
            px = {}

            for a in Attr:
                for x in ex:
                    # Only the value in this example that belongs to attribute a matches
                    if x in AttrConcept[a]:
                        for t in target_list:
                            if t not in px:
                                px[t] = targetProbs[t] * AttrConcept[a][x][t] / probability_list[a][x]
                            else:
                                px[t] = px[t] * AttrConcept[a][x][t] / probability_list[a][x]
            print(px)
            classification = max(px, key=px.get)
            print("Classification :", classification, "Expected :", ex[-1])
            if classification != ex[-1]:
                misclassification_count += 1

        misclassification_rate = misclassification_count * 100 / Total
        accuracy = 100 - misclassification_rate
        print("Misclassification Count = {}".format(misclassification_count))
        print("Misclassification Rate = {}%".format(misclassification_rate))
        print("Accuracy = {}%".format(accuracy))

    targetProbs, AttrConcept, probability_list = train(df, Attr, target_list, target)
    test(df.values, Attr, target_list, targetProbs, AttrConcept, probability_list)

    # OUTPUT :-
    # P(A) : {'Yes': 0.6428571428571429, 'No': 0.35714285714285715}

    # P(X|A) : {'Outlook': {'Overcast': {'Yes': 0.4444444444444444, 'No': 0.0}, 'Rain': {'Yes': 0.3333333333333333, 'No': 0.4}, 'Sunny': {'Yes': 0.2222222222222222, 'No': 0.6}}, 'Temperature': {'Cool': {'Yes': 0.3333333333333333, 'No': 0.2}, 'Mild': {'Yes': 0.4444444444444444, 'No': 0.4}, 'Hot': {'Yes': 0.2222222222222222, 'No': 0.4}}, 'Humidity': {'High': {'Yes': 0.3333333333333333, 'No': 0.8}, 'Normal': {'Yes': 0.6666666666666666, 'No': 0.2}}, 'Wind': {'Weak': {'Yes': 0.6666666666666666, 'No': 0.4}, 'Strong': {'Yes': 0.3333333333333333, 'No': 0.6}}}

    # P(X) : {'Outlook': {'Overcast': 0.2857142857142857, 'Rain': 0.35714285714285715, 'Sunny': 0.35714285714285715}, 'Temperature': {'Cool': 0.2857142857142857, 'Mild': 0.42857142857142855, 'Hot': 0.2857142857142857}, 'Humidity': {'High': 0.5, 'Normal': 0.5}, 'Wind': {'Weak': 0.5714285714285714, 'Strong': 0.42857142857142855}}

    # {'Yes': 0.2419753086419753, 'No': 0.9408000000000002}
    # Classification : No Expected : No
    # {'Yes': 0.16131687242798354, 'No': 1.8816000000000002}
    # Classification : No Expected : No
    # {'Yes': 0.6049382716049383, 'No': 0.0}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.4839506172839506, 'No': 0.4181333333333335}
    # Classification : Yes Expected : Yes
    # {'Yes': 1.0888888888888888, 'No': 0.07840000000000004}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.7259259259259259, 'No': 0.15680000000000005}
    # Classification : Yes Expected : No
    # {'Yes': 1.2098765432098766, 'No': 0.0}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.3226337448559671, 'No': 0.6272000000000001}
    # Classification : No Expected : No
    # {'Yes': 0.7259259259259256, 'No': 0.11760000000000002}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.9679012345679012, 'No': 0.10453333333333338}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.43017832647462273, 'No': 0.31360000000000005}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.5377229080932785, 'No': 0.0}
    # Classification : Yes Expected : Yes
    # {'Yes': 1.2098765432098766, 'No': 0.0}
    # Classification : Yes Expected : Yes
    # {'Yes': 0.3226337448559671, 'No': 0.8362666666666669}
    # Classification : No Expected : No
    # Misclassification Count = 1
    # Misclassification Rate = 7.142857142857143%
    # Accuracy = 92.85714285714286%
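    # A hedged cross-check, not in the original gist (assumes scikit-learn is
    # installed): CategoricalNB on the same label-encoded data should land near
    # the hand-rolled classifier's training accuracy. It applies Laplace
    # smoothing (alpha=1.0) by default, so the scores need not match exactly.
    from sklearn.naive_bayes import CategoricalNB
    from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

    X_enc = OrdinalEncoder().fit_transform(df[list(Attr)])
    y_enc = LabelEncoder().fit_transform(df[target])
    print("sklearn CategoricalNB accuracy :", CategoricalNB().fit(X_enc, y_enc).score(X_enc, y_enc))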
  3. Adithya-Rama created this gist Jan 9, 2024.
    141 changes: 141 additions & 0 deletions AOStarAlgorithm.py
    @@ -0,0 +1,141 @@
    def recAOStar(n):
        print("Expanding Node : ", n)
        and_nodes = []
        or_nodes = []
        # Segregation of AND and OR nodes
        if n in allNodes:
            if 'AND' in allNodes[n]:
                and_nodes = allNodes[n]['AND']
            if 'OR' in allNodes[n]:
                or_nodes = allNodes[n]['OR']
        # If leaf node then return
        if len(and_nodes) == 0 and len(or_nodes) == 0:
            return
        solvable = False
        marked = {}

        while not solvable:
            # If all the child nodes are visited and expanded, take the least cost of all the child nodes
            if len(marked) == len(and_nodes) + len(or_nodes):
                min_cost_least, min_cost_group_least = least_cost_group(and_nodes, or_nodes, {})
                solvable = True
                change_heuristic(n, min_cost_least)
                optimal_child_group[n] = min_cost_group_least
                continue
            # Least cost of the unmarked child nodes
            min_cost, min_cost_group = least_cost_group(and_nodes, or_nodes, marked)
            is_expanded = False

            # If the child nodes have subtrees, recursively visit them to recalculate the heuristic of the child node
            if len(min_cost_group) > 1:
                if min_cost_group[0] in allNodes:
                    is_expanded = True
                    recAOStar(min_cost_group[0])
                if min_cost_group[1] in allNodes:
                    is_expanded = True
                    recAOStar(min_cost_group[1])
            else:
                if min_cost_group in allNodes:
                    is_expanded = True
                    recAOStar(min_cost_group)
            # If the child node had a subtree and was expanded, verify that the new heuristic value is still the least among all nodes
            if is_expanded:
                min_cost_verify, min_cost_group_verify = least_cost_group(and_nodes, or_nodes, {})
                if min_cost_group == min_cost_group_verify:
                    solvable = True
                    change_heuristic(n, min_cost_verify)
                    optimal_child_group[n] = min_cost_group
            # If the child node has no subtree there is no change in heuristic, so update the min cost of the current node
            else:
                solvable = True
                change_heuristic(n, min_cost)
                optimal_child_group[n] = min_cost_group
            # Mark the child node which was expanded
            marked[min_cost_group] = 1
        return heuristic(n)



    # Function to calculate the min cost among all the child nodes
    def least_cost_group(and_nodes, or_nodes, marked):
        node_wise_cost = {}
        for node_pair in and_nodes:
            if not node_pair[0] + node_pair[1] in marked:
                # An AND group costs both children's heuristics plus two edge costs
                cost = heuristic(node_pair[0]) + heuristic(node_pair[1]) + 2
                node_wise_cost[node_pair[0] + node_pair[1]] = cost
        for node in or_nodes:
            if node not in marked:
                # An OR child costs its heuristic plus one edge cost
                cost = heuristic(node) + 1
                node_wise_cost[node] = cost
        min_cost = 999999
        min_cost_group = None
        # Calculates the min heuristic
        for costKey in node_wise_cost:
            if node_wise_cost[costKey] < min_cost:
                min_cost = node_wise_cost[costKey]
                min_cost_group = costKey
        return [min_cost, min_cost_group]

    # Returns heuristic of a node
    def heuristic(n):
        return H_dist[n]

    # Updates the heuristic of a node
    def change_heuristic(n, cost):
        H_dist[n] = cost
        return

    # Function to print the optimal cost nodes
    def print_path(node):
        print(optimal_child_group[node], end="")
        node = optimal_child_group[node]
        if len(node) > 1:
            if node[0] in optimal_child_group:
                print("->", end="")
                print_path(node[0])
            if node[1] in optimal_child_group:
                print("->", end="")
                print_path(node[1])
        else:
            if node in optimal_child_group:
                print("->", end="")
                print_path(node)

    #Describe the heuristic here
    H_dist = {
    'A': -1,
    'B': 4,
    'C': 2,
    'D': 3,
    'E': 6,
    'F': 8,
    'G': 2,
    'H': 0,
    'I': 0,
    'J': 0
    }

    #Describe your graph here
    allNodes = {
    'A': {'AND': [('C', 'D')], 'OR': ['B']},
    'B': {'OR': ['E', 'F']},
    'C': {'OR': ['G'], 'AND': [('H', 'I')]},
    'D': {'OR': ['J']}
    }

    optimal_child_group = {}
    optimal_cost = recAOStar('A')

    print('Nodes which give the optimal cost are')
    print_path('A')
    print('\nOptimal Cost is :: ', optimal_cost)
    print(optimal_child_group)

    # OUTPUT :-
    # Nodes which give the optimal cost are
    # CD->HI->J
    # Optimal Cost is :: 5
    # {'B': 'E', 'C': 'HI', 'D': 'J', 'A': 'CD'}
    89 changes: 89 additions & 0 deletions AStarAlgorithm.py
    @@ -0,0 +1,89 @@
    Graph_nodes = {
    'A' : [('B', 6), ('F', 3)],
    'B' : [('C', 3), ('D', 2)],
    'C' : [('D', 1), ('E', 5)],
    'D' : [('C', 1), ('E', 8)],
    'E' : [('I', 5), ('J', 5)],
    'F' : [('G', 1), ('H', 7)],
    'G' : [('I', 3)],
    'H' : [('I', 2)],
    'I' : [('E', 5), ('J', 3)],
    }

    def heuristic(v):
        H_dist = {
            'A': 10,
            'B': 8,
            'C': 5,
            'D': 7,
            'E': 3,
            'F': 6,
            'G': 5,
            'H': 3,
            'I': 1,
            'J': 0,
        }
        return H_dist[v]

    def neighbors(v):
        if v in Graph_nodes:
            return Graph_nodes[v]
        else:
            return None

    def aStarAlgo(start_node, stop_node):

        open_set = {start_node}  # set(start_node) would split a multi-character name into characters
        closed_set = set()
        g = {}        # cost from the start node
        parents = {}  # parent map used to reconstruct the path

        g[start_node] = 0
        parents[start_node] = start_node

        while len(open_set) > 0:
            n = None

            # Pick the open node with the least f(n) = g(n) + h(n)
            for v in open_set:
                if n is None or g[v] + heuristic(v) < g[n] + heuristic(n):
                    n = v

            if n != stop_node and neighbors(n) is not None:
                for (m, weight) in neighbors(n):
                    if m not in open_set and m not in closed_set:
                        open_set.add(m)
                        parents[m] = n
                        g[m] = g[n] + weight
                    else:
                        # Found a cheaper route to m: update it and reopen if needed
                        if g[m] > g[n] + weight:
                            g[m] = g[n] + weight
                            parents[m] = n
                            if m in closed_set:
                                closed_set.remove(m)
                                open_set.add(m)

            if n is None:
                print("Path doesn't exist!!")
                return None
            if n == stop_node:
                path = []

                while parents[n] != n:
                    path.append(n)
                    n = parents[n]

                path.append(start_node)
                path.reverse()
                print("The path is : ", path)
                return path

            open_set.remove(n)
            closed_set.add(n)

        print("Path doesn't exist")
        return None

    aStarAlgo('A', 'J')

    # OUTPUT :- ['A', 'F', 'G', 'I', 'J']
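    # A hedged extra, not in the original gist: path_cost is a hypothetical helper
    # that sums the edge weights along a returned path, to sanity-check the route.
    def path_cost(path):
        return sum(w for a, b in zip(path, path[1:])
                   for n, w in Graph_nodes[a] if n == b)

    print("Cost of the A->J path :", path_cost(['A', 'F', 'G', 'I', 'J']))  # 3+1+3+3 = 10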
    53 changes: 53 additions & 0 deletions BackwardPropagationAlgorithm.py
    @@ -0,0 +1,53 @@
    import numpy as np

    X = np.array(([2, 9], [1, 5], [3, 6]), dtype='float')
    Y = np.array(([92], [86], [89]), dtype = 'float')
    X = X / np.amax(X , axis=0)
    Y = Y / 100

    epochs = 1000
    learning_rate = 0.6
    inputLayers = 2
    hiddenLayers = 3
    outputLayers = 1

    wh = np.random.uniform(size = (inputLayers, hiddenLayers))
    bh = np.random.uniform(size = (1, hiddenLayers))
    w0 = np.random.uniform(size = (hiddenLayers, outputLayers))
    b0 = np.random.uniform(size = (1, outputLayers))

    def sigmoid(z):
        return 1 / (1 + np.exp(-z))

    def derivative(x):
        return x * (1 - x)

    for i in range(epochs):
        # Forward Propagation
        z_h = np.dot(X, wh) + bh
        sigmoid_h = sigmoid(z_h)
        z_0 = np.dot(sigmoid_h, w0) + b0
        output = sigmoid(z_0)
        # Backward Propagation (note: the biases bh and b0 are left unchanged
        # in this simple version; only the weights are updated)
        deltaK = (Y - output) * derivative(output)
        deltaH = deltaK.dot(w0.T) * derivative(sigmoid_h)
        w0 = w0 + learning_rate * sigmoid_h.T.dot(deltaK)
        wh = wh + learning_rate * X.T.dot(deltaH)

    print(f"Input:\n {X}")
    print(f"Actual Output:\n {Y}")
    print(f"Predicted Output:\n {output}")


    # OUTPUT :-
    # Input:
    # [[0.66666667 1. ]
    # [0.33333333 0.55555556]
    # [1. 0.66666667]]
    # Actual Output:
    # [[0.92]
    # [0.86]
    # [0.89]]
    # Predicted Output:
    # [[0.89561426]
    # [0.87785989]
    # [0.89594741]]
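    # A hedged extra, not in the original gist: report the mean squared error of
    # the final predictions (assumes the training loop above has just run).
    mse = np.mean((Y - output) ** 2)
    print(f"Training MSE after {epochs} epochs : {mse:.6f}")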
    64 changes: 64 additions & 0 deletions BasicDecisionTreeAlgorithm.py
    @@ -0,0 +1,64 @@
    import pandas as pd
    from pandas import DataFrame
    from math import log
    from collections import Counter
    from pprint import pprint

    df_tennis = pd.read_csv('./Data/PlayTennis.csv')

    def entropy(probs):
        return sum([-prob * log(prob, 2) for prob in probs])

    def entropy_of_list(a_list):
        cnt = Counter(x for x in a_list)
        num_instances = len(a_list) * 1.0
        probs = [x / num_instances for x in cnt.values()]
        return entropy(probs)

    def information_gain(df, split_attribute_name, target_attribute_name):
        df_split = df.groupby(split_attribute_name)
        nobs = len(df.index) * 1.0
        df_agg_ent = df_split.agg({target_attribute_name: [entropy_of_list, lambda x: len(x) / nobs]})[target_attribute_name]
        df_agg_ent.columns = ['Entropy', 'PropObservations']
        new_entropy = sum(df_agg_ent['Entropy'] * df_agg_ent['PropObservations'])
        old_entropy = entropy_of_list(df[target_attribute_name])
        return old_entropy - new_entropy

    def id3(df, target_attribute_name, attribute_names, default_class=None):
        cnt = Counter(x for x in df[target_attribute_name])
        print(cnt)
        if len(cnt) == 1:
            return next(iter(cnt))
        elif df.empty or (not attribute_names):
            return default_class
        else:
            default_class = max(cnt.keys())
            gainz = [information_gain(df, attr, target_attribute_name) for attr in attribute_names]
            index_of_max = gainz.index(max(gainz))
            best_attr = attribute_names[index_of_max]
            tree = {best_attr: {}}
            remaining_attribute_names = [i for i in attribute_names if i != best_attr]

            for attr_val, data_subset in df.groupby(best_attr):
                subtree = id3(data_subset, target_attribute_name, remaining_attribute_names, default_class)
                tree[best_attr][attr_val] = subtree
            return tree

    attribute_names = list(df_tennis.columns)

    print("List of attributes: ", attribute_names)

    attribute_names.remove('Play Tennis')

    print("Predicting Attributes: ", attribute_names)

    tree = id3(df_tennis, 'Play Tennis', attribute_names)

    print("\n\nThe Resultant Decistion Tree is: \n")
    pprint(tree)

    # OUTPUT :-
    # The Resultant Decision Tree is:
    # {'Outlook': {'Overcast': 'Yes',
    # 'Rain': {'Wind': {'Strong': 'No', 'Weak': 'Yes'}},
    # 'Sunny': {'Humidity': {'High': 'No', 'Normal': 'Yes'}}}}
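    # A hedged extra, not in the original gist: classify is a hypothetical helper
    # that walks the nested-dict tree to label one instance given as a
    # {attribute: value} mapping, returning default when a value is unseen.
    def classify(instance, tree, default=None):
        if not isinstance(tree, dict):
            return tree
        attr = next(iter(tree))
        value = instance.get(attr)
        if value not in tree[attr]:
            return default
        return classify(instance, tree[attr][value], default)

    print(classify({'Outlook': 'Sunny', 'Humidity': 'Normal'}, tree))  # -> 'Yes'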
    36 changes: 36 additions & 0 deletions CandidateEliminationAlgorithm.py
    @@ -0,0 +1,36 @@
    import pandas as pd

    df = pd.read_csv("./Data/EnjoySport.csv")
    print(df.head())

    concepts = df.values[:,:-1]
    target = df.values[:, -1]

    def learn(concepts, target):
        specific_h = concepts[0].copy()
        print(specific_h)
        general_h = [["?" for i in range(len(specific_h))] for i in range(len(specific_h))]
        print(general_h)

        for i, h in enumerate(concepts):
            # Positive example: generalise the specific hypothesis where it disagrees
            if target[i] == "yes":
                for x in range(len(specific_h)):
                    if h[x] != specific_h[x]:
                        specific_h[x] = "?"
                        general_h[x][x] = "?"
            # Negative example: specialise the general hypotheses where they disagree
            if target[i] == "no":
                for x in range(len(specific_h)):
                    if h[x] != specific_h[x]:
                        general_h[x][x] = specific_h[x]
                    else:
                        general_h[x][x] = "?"

        # Drop the hypotheses that stayed fully general
        indices = [i for i, val in enumerate(general_h) if val == ["?", "?", "?", "?", "?", "?"]]
        for i in indices:
            general_h.remove(["?", "?", "?", "?", "?", "?"])
        print(specific_h)
        print(general_h)

    learn(concepts, target)

    # OUTPUT:-
    # ['sunny' 'warm' '?' 'strong' '?' '?']
    # [['sunny', '?', '?', '?', '?', '?'], ['?', 'warm', '?', '?', '?', '?']]