from random import choices, choice

import pandas as pd
import numpy as np

# ---------------------------------------------------------------------------
# Part 1: find each customer's two most-bought articles from random purchases.
# ---------------------------------------------------------------------------
nrows = 1000
customers = ["John", "Mary", "Alex", "Smith"]
products = ["Beverage", "Meat", "Vegetable", "Fruit"]

# One row per purchase; categorical dtype keeps the repeated strings compact.
example_data = pd.DataFrame(
    {
        "customer_id": choices(customers, k=nrows),
        "article_id": choices(products, k=nrows),
    }
).astype("category")
print(example_data.head())

# Count purchases per (customer, article) pair; observed=True skips category
# combinations that never occur in the data.
example = example_data.groupby(["customer_id", "article_id"], observed=True).size()
print(example.head(3))

# Convert the counts Series to a DataFrame with an explicit "count" column.
example = example.reset_index(name="count")
print(example.head(3))

# Sort descending so each customer's most-bought articles come first.
example = example.sort_values(["customer_id", "count"], ascending=False)
print(example.head(3))

# Keep the two most-bought articles per customer as a list.
example = example.groupby("customer_id", observed=True)["article_id"].apply(
    lambda articles: articles.head(2).to_list()
)
print(example.head(3))

# ---------------------------------------------------------------------------
# Part 2: score the predictions of a recommender system.
# ---------------------------------------------------------------------------
products = np.arange(10)
maximum_actuals = np.arange(3) + 1

# Each customer truly bought 1-3 random products (repeats are possible) ...
truth = pd.DataFrame(
    {
        "customer": customers,
        "y": [choices(products, k=choice(maximum_actuals)) for _ in customers],
    }
)
# ... and receives len(maximum_actuals) random recommendations.
pred = pd.DataFrame(
    {
        "customer": customers,
        "y": [choices(products, k=len(maximum_actuals)) for _ in customers],
    }
)


def recall(customertruth, customerpred) -> float:
    """Return the fraction of distinct true items that were predicted.

    Both inputs are treated as sets, so repeated items in either the truth
    or the predictions are counted once and the result always lies in
    [0, 1].

    Args:
        customertruth: Iterable of hashable item ids the customer bought.
        customerpred: Iterable of hashable item ids that were recommended.

    Returns:
        ``len(truth & pred) / len(truth)`` over distinct items, or ``0.0``
        when ``customertruth`` is empty (nothing could be recalled).
    """
    relevant = set(customertruth)
    if not relevant:
        # Recall is undefined without relevant items; report 0.0 instead of
        # raising ZeroDivisionError.
        return 0.0
    return len(relevant.intersection(customerpred)) / len(relevant)


def score(truth: pd.DataFrame, pred: pd.DataFrame, fun) -> pd.Series:
    """Apply a per-customer metric to aligned truth and prediction lists.

    Args:
        truth: DataFrame with columns ``customer`` and ``y`` (true items).
        pred: DataFrame with columns ``customer`` and ``y`` (predictions).
        fun: Callable accepting keyword arguments ``customertruth`` and
            ``customerpred`` and returning the metric for one customer.

    Returns:
        Series with one metric value per row of the left-merged frame.
    """
    combined = pd.merge(
        truth, pred, how="left", on="customer", suffixes=("_truth", "_pred")
    )
    # The left join leaves NaN in y_pred for customers without a prediction
    # row; hand the metric an empty list in that case so it can still iterate.
    values = [
        fun(
            customertruth=actual,
            customerpred=guess if pd.api.types.is_list_like(guess) else [],
        )
        for actual, guess in zip(combined["y_truth"], combined["y_pred"])
    ]
    return pd.Series(values, index=combined.index)


print(score(truth, truth, recall).mean())
print(score(truth, pred, recall).mean())
print("----")
print(score(truth, pred, recall).iloc[0], truth.iloc[0, 1], pred.iloc[0, 1])