Skip to content

Instantly share code, notes, and snippets.

@Finesim97
Last active April 29, 2022 17:44
Show Gist options
  • Save Finesim97/adf35b1d057dada6b93a3e3c40d57749 to your computer and use it in GitHub Desktop.
Save Finesim97/adf35b1d057dada6b93a3e3c40d57749 to your computer and use it in GitHub Desktop.
Helpful snippets for dealing with transactional (purchase) data in pandas
from random import choices, choice
import pandas as pd
import numpy as np
# Build a toy purchase log: one row per (customer, article) purchase.
nrows = 1000
customers = ["John", "Mary", "Alex", "Smith"]
products = ["Beverage", "Meat", "Vegetable", "Fruit"]
example_data = pd.DataFrame(
    {
        "customer_id": choices(customers, k=nrows),
        "article_id": choices(products, k=nrows),
    }
).astype("category")
print(example_data.head())

# Count how often each customer bought each article.
# observed=True keeps only (customer, article) pairs that actually occur.
example = example_data.groupby(["customer_id", "article_id"], observed=True).size()
print(example.head(3))

# Turn the counts Series into a DataFrame with an explicit "count" column.
example = example.reset_index(name="count")
print(example.head(3))

# Order rows by customer, then by purchase count, both descending, so each
# customer's most bought articles come first within that customer's rows.
example = example.sort_values(by=["customer_id", "count"], ascending=False)
print(example.head(3))

# Keep the two most bought articles per customer as a list.
example = example.groupby("customer_id", observed=True)["article_id"].apply(
    lambda articles: articles.iloc[:2].tolist()
)
print(example.head(3))
# Random ground truth and predictions for scoring a recommender system.
# NOTE: `choices` samples with replacement, so the item lists may contain
# repeated products.
products = np.arange(10)
maximum_actuals = np.arange(1, 4)
# Each customer actually bought between 1 and 3 randomly drawn products ...
truth = pd.DataFrame(
    {
        "customer": customers,
        "y": [choices(products, k=choice(maximum_actuals)) for _ in customers],
    }
)
# ... and always receives len(maximum_actuals) (= 3) random recommendations.
pred = pd.DataFrame(
    {
        "customer": customers,
        "y": [choices(products, k=len(maximum_actuals)) for _ in customers],
    }
)
def recall(customertruth, customerpred):
    """Return the recall of one customer's predictions.

    Recall is the fraction of the *distinct* items in ``customertruth`` that
    also occur in ``customerpred``. Both inputs are treated as sets, so a
    repeated item in either list is counted once and the result always lies
    in ``[0, 1]``.

    Args:
        customertruth: Iterable of hashable item ids the customer really bought.
        customerpred: Iterable of hashable item ids that were predicted.

    Returns:
        The recall as a float; ``0.0`` when ``customertruth`` is empty, since
        there is nothing that could have been recalled.
    """
    relevant = set(customertruth)
    if not relevant:
        # Avoid a ZeroDivisionError for customers without any true items.
        return 0.0
    return len(relevant.intersection(customerpred)) / len(relevant)
def score(truth, pred, fun):
    """Apply a per-customer metric to ground truth and predictions.

    ``truth`` and ``pred`` are joined on ``"customer"`` with a left join, so
    every row of ``truth`` is scored. A customer that has no row in ``pred``
    is scored against an empty prediction list instead of the NaN the merge
    leaves behind (iterating NaN inside the metric would raise a TypeError).

    Args:
        truth: DataFrame with columns ``"customer"`` and ``"y"``, where ``"y"``
            holds the list of true items per customer.
        pred: DataFrame with the same two columns, where ``"y"`` holds the
            list of predicted items per customer.
        fun: Metric called as ``fun(customertruth=..., customerpred=...)``
            once per merged row.

    Returns:
        A Series with one metric value per merged row, indexed like the
        merge result.
    """
    combined = pd.merge(
        truth, pred, how="left", on="customer", suffixes=("_truth", "_pred")
    )
    # isna() is False for list cells and True only for the NaN placeholders
    # that the left join inserts for customers without predictions.
    predicted = [
        [] if missing else items
        for items, missing in zip(combined["y_pred"], combined["y_pred"].isna())
    ]
    values = [
        fun(customertruth=actual, customerpred=guess)
        for actual, guess in zip(combined["y_truth"], predicted)
    ]
    return pd.Series(values, index=combined.index)
# Sanity check: scoring the truth against itself should give a mean of 1.0.
print(score(truth=truth, pred=truth, fun=recall).mean())
# Mean recall of the random predictions over all customers.
print(score(truth=truth, pred=pred, fun=recall).mean())
print("----")
# Inspect the first customer: its recall, its true items, its predicted items.
print(
    score(truth=truth, pred=pred, fun=recall).iloc[0],
    truth.iloc[0, 1],
    pred.iloc[0, 1],
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment