Finesim97 · April 29, 2022 17:44
diff --git a/examples.py b/examples.py
 from random import choices, choice
 import pandas as pd
 import numpy as np
 nrows = 1000
 customers = ["John", "Mary", "Alex", "Smith"]
 products = ["Beverage", "Meat", "Vegetable", "Fruit"]
 # Random purchase log: one (customer, article) pair per row, both columns
 # stored as categoricals.
 example_data = pd.DataFrame(
     {
         "customer_id": choices(customers, k=nrows),
         "article_id": choices(products, k=nrows),
     }
 ).astype("category")
 print(example_data.head())


 # Count how often each customer bought each article.
 example = example_data.groupby(["customer_id", "article_id"], observed=True).size()
 print(example.head(3))
 # Turn the counts Series into a DataFrame; the unnamed value column comes out
 # labelled 0, so give it a readable name.
 example = example.reset_index()
 example = example.rename(columns={0: "count"})
 print(example.head(3))
 # Put the highest counts first within each customer.
 example = example.sort_values(by=["customer_id", "count"], ascending=False)
 print(example.head(3))
 # Keep the two most bought articles per customer as a plain list.
 example = example.groupby("customer_id", observed=True)["article_id"].apply(
     lambda articles: articles.iloc[:2].tolist()
 )
 print(example.head(3))

 # Build random ground truth and predictions for scoring a recommender system.
 # Both are drawn with replacement, so a list may contain repeated products,
 # which the scoring has to cope with.
 products = np.arange(10)
 maximum_actuals = np.arange(1, 4)
 truth = pd.DataFrame(
     {
         "customer": customers,
         # Each customer gets a randomly chosen number of true items (1 to 3).
         "y": [choices(products, k=choice(maximum_actuals)) for _ in customers],
     }
 )
 pred = pd.DataFrame(
     {
         "customer": customers,
         # Every customer gets len(maximum_actuals) predicted items.
         "y": [choices(products, k=len(maximum_actuals)) for _ in customers],
     }
 )


 def recall(customertruth, customerpred):
     """Return the share of a customer's true items that were predicted.

     Both arguments are compared as sets of distinct items, so a repeated
     item counts only once on either side and the result always lies in
     the range 0.0 to 1.0.

     Args:
         customertruth: Iterable of the items the customer actually has.
             Items must be hashable.
         customerpred: Iterable of the items predicted for the customer.
             Items must be hashable.

     Returns:
         Number of distinct true items found among the predictions, divided
         by the number of distinct true items.

     Raises:
         ZeroDivisionError: If ``customertruth`` is empty.
     """
     relevant = set(customertruth)
     retrieved = set(customerpred)
     # Counting distinct hits keeps a repeated prediction from being rewarded
     # more than once and a repeated true item from inflating the denominator.
     return len(relevant & retrieved) / len(relevant)



 def score(truth, pred, fun):
     """Apply a per-customer metric to joined truth and prediction frames.

     ``truth`` and ``pred`` are merged on their ``customer`` column with a
     left join, so every row of ``truth`` is scored. Their ``y`` columns
     become ``y_truth`` and ``y_pred`` in the merged frame.

     Args:
         truth: DataFrame with a ``customer`` column and a ``y`` column
             holding each customer's true items.
         pred: DataFrame with a ``customer`` column and a ``y`` column
             holding each customer's predicted items.
         fun: Callable invoked as
             ``fun(customertruth=..., customerpred=...)`` for every merged
             row.

     Returns:
         The result of applying ``fun`` to each row of the merged frame.
     """
     combined = pd.merge(
         truth, pred, how="left", on="customer", suffixes=["_truth", "_pred"]
     )

     def _score_row(row):
         predicted = row["y_pred"]
         # The left join leaves a scalar missing value where a customer has
         # no prediction row; hand the metric an empty prediction list
         # instead of a value it cannot iterate over.
         if pd.api.types.is_scalar(predicted) and pd.isna(predicted):
             predicted = []
         return fun(customertruth=row["y_truth"], customerpred=predicted)

     return combined.apply(_score_row, axis=1)

 # Baseline: score the ground truth against itself.
 baseline_scores = score(truth, truth, recall)
 print(baseline_scores.mean())
 # Mean recall of the random predictions.
 prediction_scores = score(truth, pred, recall)
 print(prediction_scores.mean())
 print("----")
 # Show the first customer's score next to their true and predicted items
 # (column position 1 holds the item lists).
 first_truth = truth.iloc[0, 1]
 first_pred = pred.iloc[0, 1]
 print(prediction_scores.iloc[0], first_truth, first_pred)
	from random import choices, choice
	import pandas as pd
	import numpy as np
	nrows = 1000
	customers = ["John", "Mary", "Alex", "Smith"]
	products = ["Beverage", "Meat", "Vegetable", "Fruit"]
	# Random purchase log: one (customer, article) pair per row, both columns
	# stored as categoricals.
	example_data = pd.DataFrame(
	    {
	        "customer_id": choices(customers, k=nrows),
	        "article_id": choices(products, k=nrows),
	    }
	).astype("category")
	print(example_data.head())


	# Count how often each customer bought each article.
	example = example_data.groupby(["customer_id", "article_id"], observed=True).size()
	print(example.head(3))
	# Turn the counts Series into a DataFrame; the unnamed value column comes out
	# labelled 0, so give it a readable name.
	example = example.reset_index()
	example = example.rename(columns={0: "count"})
	print(example.head(3))
	# Put the highest counts first within each customer.
	example = example.sort_values(by=["customer_id", "count"], ascending=False)
	print(example.head(3))
	# Keep the two most bought articles per customer as a plain list.
	example = example.groupby("customer_id", observed=True)["article_id"].apply(
	    lambda articles: articles.iloc[:2].tolist()
	)
	print(example.head(3))

	# Build random ground truth and predictions for scoring a recommender system.
	# Both are drawn with replacement, so a list may contain repeated products,
	# which the scoring has to cope with.
	products = np.arange(10)
	maximum_actuals = np.arange(1, 4)
	truth = pd.DataFrame(
	    {
	        "customer": customers,
	        # Each customer gets a randomly chosen number of true items (1 to 3).
	        "y": [choices(products, k=choice(maximum_actuals)) for _ in customers],
	    }
	)
	pred = pd.DataFrame(
	    {
	        "customer": customers,
	        # Every customer gets len(maximum_actuals) predicted items.
	        "y": [choices(products, k=len(maximum_actuals)) for _ in customers],
	    }
	)


	def recall(customertruth, customerpred):
	    """Return the share of a customer's true items that were predicted.

	    Both arguments are compared as sets of distinct items, so a repeated
	    item counts only once on either side and the result always lies in
	    the range 0.0 to 1.0.

	    Args:
	        customertruth: Iterable of the items the customer actually has.
	            Items must be hashable.
	        customerpred: Iterable of the items predicted for the customer.
	            Items must be hashable.

	    Returns:
	        Number of distinct true items found among the predictions, divided
	        by the number of distinct true items.

	    Raises:
	        ZeroDivisionError: If ``customertruth`` is empty.
	    """
	    relevant = set(customertruth)
	    retrieved = set(customerpred)
	    # Counting distinct hits keeps a repeated prediction from being rewarded
	    # more than once and a repeated true item from inflating the denominator.
	    return len(relevant & retrieved) / len(relevant)



	def score(truth, pred, fun):
	    """Apply a per-customer metric to joined truth and prediction frames.

	    ``truth`` and ``pred`` are merged on their ``customer`` column with a
	    left join, so every row of ``truth`` is scored. Their ``y`` columns
	    become ``y_truth`` and ``y_pred`` in the merged frame.

	    Args:
	        truth: DataFrame with a ``customer`` column and a ``y`` column
	            holding each customer's true items.
	        pred: DataFrame with a ``customer`` column and a ``y`` column
	            holding each customer's predicted items.
	        fun: Callable invoked as
	            ``fun(customertruth=..., customerpred=...)`` for every merged
	            row.

	    Returns:
	        The result of applying ``fun`` to each row of the merged frame.
	    """
	    combined = pd.merge(
	        truth, pred, how="left", on="customer", suffixes=["_truth", "_pred"]
	    )

	    def _score_row(row):
	        predicted = row["y_pred"]
	        # The left join leaves a scalar missing value where a customer has
	        # no prediction row; hand the metric an empty prediction list
	        # instead of a value it cannot iterate over.
	        if pd.api.types.is_scalar(predicted) and pd.isna(predicted):
	            predicted = []
	        return fun(customertruth=row["y_truth"], customerpred=predicted)

	    return combined.apply(_score_row, axis=1)

	# Baseline: score the ground truth against itself.
	baseline_scores = score(truth, truth, recall)
	print(baseline_scores.mean())
	# Mean recall of the random predictions.
	prediction_scores = score(truth, pred, recall)
	print(prediction_scores.mean())
	print("----")
	# Show the first customer's score next to their true and predicted items
	# (column position 1 holds the item lists).
	first_truth = truth.iloc[0, 1]
	first_pred = pred.iloc[0, 1]
	print(prediction_scores.iloc[0], first_truth, first_pred)