from random import choices, choice
import pandas as pd
import numpy as np
# --- Synthetic purchase log: one row per (customer, article) purchase ---
nrows = 1000
customers = ["John", "Mary", "Alex", "Smith"]
products = ["Beverage", "Meat", "Vegetable", "Fruit"]
example_data = pd.DataFrame(
    {
        "customer_id": choices(customers, k=nrows),
        "article_id": choices(products, k=nrows),
    }
).astype("category")
print(example_data.head())


def _top_two(articles):
    """Return the first two entries of a customer's article column as a list."""
    return articles.head(2).to_list()


# Count how often each customer bought each article.
example = example_data.groupby(["customer_id", "article_id"], observed=True).size()
print(example.head(3))

# Flatten the counts into a frame with an explicit "count" column.
example = example.reset_index(name="count")
print(example.head(3))

# Order rows by customer, then by purchase count, both descending, so each
# customer's most frequently bought articles come first within their rows.
example = example.sort_values(by=["customer_id", "count"], ascending=False)
print(example.head(3))

# Keep the two most bought articles per customer.
example = example.groupby("customer_id", observed=True)["article_id"].apply(_top_two)
print(example.head(3))

# score predictions of a recommender system (note: `choices` samples with replacement, so the lists below can contain repeats, which a per-item hit count would miscount)


# Item catalogue for the scoring demo: integer ids 0..9 (rebinds `products`).
products = np.arange(10)
# Candidate sizes for a customer's ground-truth list: 1, 2 or 3.
maximum_actuals = np.arange(1, 4)
# One ground-truth list per customer, each with a randomly chosen size.
truth = pd.DataFrame(
    {
        "customer": customers,
        "y": [choices(products, k=choice(maximum_actuals)) for _ in customers],
    }
)
# One prediction list per customer, always len(maximum_actuals) items long.
pred = pd.DataFrame(
    {
        "customer": customers,
        "y": [choices(products, k=len(maximum_actuals)) for _ in customers],
    }
)


def recall(customertruth, customerpred):
    """Return the fraction of a customer's true items that were predicted.

    Both inputs are reduced to their distinct items before comparing, so an
    item repeated in either list is counted once. Counting every occurrence
    instead would let a prediction list such as ``[1, 1, 1]`` scored against
    ``[1]`` produce a recall of 3.

    Args:
        customertruth: Iterable of hashable items the customer actually has.
        customerpred: Iterable of hashable items predicted for the customer.

    Returns:
        A float in ``[0, 1]``: the number of distinct true items that were
        predicted, divided by the number of distinct true items.

    Raises:
        ZeroDivisionError: If ``customertruth`` is empty.
    """
    relevant = set(customertruth)
    hits = relevant.intersection(customerpred)
    return len(hits) / len(relevant)


def score(truth, pred, fun):
    """Apply a per-customer metric to ground truth and predictions.

    ``truth`` and ``pred`` are left-joined on ``"customer"``, so every row of
    ``truth`` is scored. A customer with no row in ``pred`` is scored against
    an empty prediction list instead of the NaN the join would leave behind.

    Args:
        truth: DataFrame with a ``"customer"`` column and a ``"y"`` column
            holding each customer's true items.
        pred: DataFrame with a ``"customer"`` column and a ``"y"`` column
            holding each customer's predicted items.
        fun: Callable invoked as ``fun(customertruth=..., customerpred=...)``
            for each joined row.

    Returns:
        The result of applying ``fun`` row-wise to the joined frame (one
        value per joined row).
    """
    combined = pd.merge(
        truth, pred, how="left", on="customer", suffixes=("_truth", "_pred")
    )

    def _score_row(row):
        customerpred = row["y_pred"]
        # The left join fills y_pred with a scalar NaN when the customer has
        # no predictions; iterating that would raise, so use an empty list.
        if pd.api.types.is_scalar(customerpred) and pd.isna(customerpred):
            customerpred = []
        return fun(customertruth=row["y_truth"], customerpred=customerpred)

    return combined.apply(_score_row, axis=1)

# Baseline: mean recall when the ground truth is scored against itself.
truth_vs_truth = score(truth, truth, recall)
print(truth_vs_truth.mean())

# Mean recall of the random predictions.
truth_vs_pred = score(truth, pred, recall)
print(truth_vs_pred.mean())

print("----")

# Inspect the first row: its recall next to its truth and prediction entries.
first_recall = score(truth, pred, recall).iloc[0]
first_truth = truth.iloc[0, 1]
first_pred = pred.iloc[0, 1]
print(first_recall, first_truth, first_pred)