In [1]:
import ast
import os
import random
import kagglehub
import json

import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from copy import deepcopy

pd.options.display.max_colwidth = None

In [2]:
from collections import defaultdict
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Competition data

In [3]:
# Resolve the competition files through kagglehub instead of a hard-coded,
# user-specific cache path, so the notebook runs on any machine.
# NOTE: needs Kaggle credentials and accepted competition rules; the files
# are served from the local kagglehub cache when already downloaded.
data_dir = kagglehub.competition_download("eedi-mining-misconceptions-in-mathematics")

# Competition questions and the misconception id -> name table.
df = pd.read_csv(os.path.join(data_dir, "train.csv"))
content_df = pd.read_csv(os.path.join(data_dir, "misconception_mapping.csv"))

In [4]:
# Attach the fold assignment stored in five_folds.parquet to each question.
fold_df = pd.read_parquet("../data/scratch/five_folds.parquet")
df = df.merge(fold_df, on="QuestionId")

# Rows of the chosen fold form the validation split; all others are training.
fold = 0
in_valid_fold = df["kfold"] == fold
train_df = df.loc[~in_valid_fold].copy()
valid_df = df.loc[in_valid_fold].copy()

In [5]:
train_df.shape, valid_df.shape

((1495, 16), (374, 16))

In [6]:
# One misconception-id column per answer option.
misconception_cols = [f"Misconception{letter}Id" for letter in ["A", "B", "C", "D"]]

# Distinct non-null misconception ids referenced by the validation split.
valid_id_values = valid_df[misconception_cols].to_numpy().ravel()
valid_misconception_ids = set(valid_id_values[pd.notna(valid_id_values)].tolist())
print(f"# of validation misconceptions: {len(valid_misconception_ids)}")

# Same collection for the training split.
train_id_values = train_df[misconception_cols].to_numpy().ravel()
train_misconception_ids = set(train_id_values[pd.notna(train_id_values)].tolist())
print(f"# of training misconceptions: {len(train_misconception_ids)}")

# remove intersections
valid_misconception_ids = valid_misconception_ids.difference(train_misconception_ids)
print(f"# of new validation misconceptions: {len(valid_misconception_ids)}")

# of validation misconceptions: 435
# of training misconceptions: 1378
# of new validation misconceptions: 226


In [7]:
vqids = valid_df['QuestionId'].unique().tolist()

# Ranker Predictions

In [9]:
# Fetch the ranker dataset and read its two parquet files
# (train.parquet and valid_ff.parquet) from the download directory.
data_dir = kagglehub.dataset_download("conjuring92/eedi-ranker-silver-v3-teacher")
ranker_df, ranker_df_valid = (
    pd.read_parquet(os.path.join(data_dir, file_name))
    for file_name in ("train.parquet", "valid_ff.parquet")
)

In [10]:
ranker_df.label.value_counts()

label
0    441400
1     18948
Name: count, dtype: int64

In [11]:
ranker_df.sample()

Unnamed: 0,query_id,content_id,SubjectName,ConstructName,QuestionText,CorrectAnswerText,InCorrectAnswerText,MisconceptionName,AllOptionText,label,teacher_score
231856,60489_A,473,Fractal Geometry and Iterative Patterns,Interpret a pictogram where the symbols are not evenly spaced,"![A pictogram showing a symbol of a triangle. The symbol is repeated 5 times in a row for the first category, then 4 times with the second triangle symbol being slightly further apart from the others for the second category, then 3 times with the third triangle symbol being slightly further apart from the others for the third category.]() Which shape has the greatest frequency?",They all have the same frequency,The first shape,Underestimates the impact of the size of images in a misleading statistical diagram,\n- The first shape\n- The second shape\n- The third shape\n- They all have the same frequency,0,-0.625


In [16]:
# Positives (label == 1) whose teacher score falls below the cutoff are
# collected here so they can be removed later.
teacher_cutoff = 2.0  # 3.0 is kept as a commented alternative
is_positive = ranker_df['label'].eq(1)
is_low_scored = ranker_df['teacher_score'].lt(teacher_cutoff)
bad_df = ranker_df.loc[is_positive & is_low_scored].copy()
bad_df.shape

(2178, 11)

In [17]:
# Question id is the part of query_id before the first "_".
# Only ids >= 2000 stay in the removal list.
# NOTE(review): presumably ids below 2000 are the original competition
# questions, which should never be dropped -- confirm.
bad_qids = [
    question_id
    for question_id in (int(query_id.split("_")[0]) for query_id in bad_df['query_id'])
    if question_id >= 2000
]
len(bad_qids)

2085

In [18]:
# Integer question id taken from query ids shaped like "<QuestionId>_<option>".
ranker_df["QuestionId"] = ranker_df["query_id"].str.split("_").str[0].astype(int)

In [19]:
ranker_df = ranker_df[~ranker_df['QuestionId'].isin(bad_qids)].copy()
ranker_df = ranker_df.reset_index(drop=True)

In [20]:
ranker_df.shape

(384909, 12)

# MCQ Data

In [22]:
# Load the MCQ questions and the misconception table shipped with the dataset.
data_dir = kagglehub.dataset_download("conjuring92/eedi-silver-v3")
mcq_df = pd.read_csv(os.path.join(data_dir, "train.csv"))
content_df = pd.read_csv(os.path.join(data_dir, "misconception_mapping.csv"))

# For every answer option, left-join the misconception name onto its id and
# store it as Misconception<letter>Name; the join key column is dropped again.
for letter in ['A', 'B', 'C', 'D']:
    id_col = f'Misconception{letter}Id'
    mcq_df = (
        mcq_df
        .merge(content_df, left_on=id_col, right_on='MisconceptionId', how='left')
        .rename(columns={'MisconceptionName': f'Misconception{letter}Name'})
        .drop(columns='MisconceptionId')
    )
mcq_df.shape

(12473, 20)

In [23]:
mcq_df = mcq_df[~mcq_df['QuestionId'].isin(bad_qids)].copy()
mcq_df.shape

(10594, 20)

In [24]:
# FULLFIT = True

# if not FULLFIT:
#     mcq_df = mcq_df[~mcq_df["QuestionId"].isin(vqids)].copy()
#     mcq_df = mcq_df.reset_index(drop=True)
#     mcq_df.shape

In [25]:
mcq_df.sample()

Unnamed: 0,QuestionId,ConstructId,ConstructName,SubjectId,SubjectName,CorrectAnswer,QuestionText,AnswerAText,AnswerBText,AnswerCText,AnswerDText,MisconceptionAId,MisconceptionBId,MisconceptionCId,MisconceptionDId,source,MisconceptionAName,MisconceptionBName,MisconceptionCName,MisconceptionDName
496,300644,-1,Convert between g and tonne,-1,Weight Units,D,A large container weighs \( 2 \mathrm{~tonnes} \). What is this in grams?,\( 2000 \mathrm{~g} \),\( 20000 \mathrm{~g} \),\( 200000 \mathrm{~g} \),\( 2000000 \mathrm{~g} \),666.0,765.0,784.0,,group,Thinks grams and tonnes are the same,Thinks there are 10kg in a tonne,Thinks there are 100g in a kilogram,


In [26]:
def count_nonempty_misconceptions(df):
    """Print how many MisconceptionAName..MisconceptionDName cells are non-null.

    Args:
        df: DataFrame holding the four ``Misconception<letter>Name`` columns.

    Returns:
        None. The total is only printed.
    """
    name_columns = [f'Misconception{letter}Name' for letter in ['A', 'B', 'C', 'D']]
    total_count = sum(df[column].notna().sum() for column in name_columns)
    print(f"Total non-empty MisconceptionNames: {total_count}")

count_nonempty_misconceptions(mcq_df)

Total non-empty MisconceptionNames: 16706


In [27]:
mcq_df = mcq_df.drop_duplicates(subset=['QuestionId'])
mcq_df = mcq_df.dropna(subset=['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId'], how='all')
mcq_df.shape

(10594, 20)

In [28]:
# A row is dropped when the option named by CorrectAnswer has a non-null
# misconception id.
correct_option_is_labelled = mcq_df.apply(
    lambda row: pd.notna(row[f'Misconception{row.CorrectAnswer}Id']),
    axis=1,
)
mcq_df = mcq_df[~correct_option_is_labelled]
print(f"Shape after dropping: {mcq_df.shape}")

Shape after dropping: (10594, 20)


In [29]:
# mcq_df.tail()

# Lookup maps (candidate lists and teacher scores per query)

In [30]:
mcq_df.shape, content_df.shape

((10594, 20), (4791, 2))

In [31]:
# pred_map: query_id -> candidate content ids, in the order they appear in ranker_df.
# teacher_score_map: "<query_id>|<content_id>" -> teacher score.
pred_map = defaultdict(list)
teacher_score_map = {}

# Zip the three columns instead of using iterrows(): iterrows builds a Series
# for every row, which is very slow on a frame of this size, while the
# resulting maps are the same.
for query_id, content_id, teacher_score in zip(
    ranker_df['query_id'], ranker_df['content_id'], ranker_df['teacher_score']
):
    pred_map[query_id].append(content_id)
    teacher_score_map[f"{query_id}|{content_id}"] = teacher_score

In [32]:
# Extend both maps with the candidates from ranker_df_valid.
# Columns are zipped rather than iterated with iterrows(), which builds a
# Series per row and is far slower for the same result.
for query_id, content_id, teacher_score in zip(
    ranker_df_valid['query_id'], ranker_df_valid['content_id'], ranker_df_valid['teacher_score']
):
    pred_map[query_id].append(content_id)
    teacher_score_map[f"{query_id}|{content_id}"] = teacher_score

# Denoise hard negatives (HN)

In [33]:
pred_df = pd.DataFrame(list(pred_map.items()), columns=['query_id', 'content_ids'])
pred_df.head()

Unnamed: 0,query_id,content_ids
0,1_A,"[2142, 2398, 2581, 2068, 838, 1755, 418, 2372, 5727, 167, 1871, 143, 2078, 2277, 2070, 891, 1256, 2256, 4523, 1535, 519, 1421, 1606, 113, 628]"
1,1_B,"[143, 891, 167, 418, 2078, 220, 979, 519, 2068, 1540, 2372, 4522, 5207, 1755, 3610, 2567, 1593, 4523, 628, 80, 113, 59, 1871, 1153]"
2,1_C,"[2142, 143, 2277, 2070, 167, 1079, 3464, 418, 547, 2581, 838, 320, 519, 1755, 113, 5727, 3412, 2436, 891, 1421, 2068, 1535, 3632, 688]"
3,2_A,"[1287, 676, 4964, 1073, 1866, 5156, 2386, 3475, 3974, 276, 1521, 2555, 4245, 1408, 4136, 310, 5661, 4297, 3908, 632, 5550, 306, 1797, 4261, 912]"
4,2_C,"[1287, 306, 2551, 1338, 5550, 5156, 1408, 2319, 3908, 365, 1975, 2439, 4261, 3974, 5661, 397, 1073, 4136, 4245, 3438, 1059, 691, 4379, 3417]"


In [34]:
pred_df[pred_df['query_id']=='0_D']

Unnamed: 0,query_id,content_ids
15855,0_D,"[1672, 1005, 1507, 2532, 1392, 706, 2488, 2306, 315, 1345, 1516, 4377, 5251, 328, 5594, 3524, 3353, 2518, 871, 4557, 158, 4465, 1999, 987, 1963, 2051, 2181, 2449, 638, 5622, 5528, 4789, 3110, 2586, 256, 1226, 488, 1316, 1011, 5587, 4149, 1670, 657, 4051, 5241, 1090, 1941, 1336]"


In [35]:
# true_map: "<QuestionId>_<letter>" -> labelled misconception id (as int),
# for every answer option that has one.
true_map = {}

option_letters = "ABCD"
option_id_columns = [mcq_df[f'Misconception{letter}Id'] for letter in option_letters]
for question_id, *option_ids in zip(mcq_df['QuestionId'], *option_id_columns):
    for letter, misconception_id in zip(option_letters, option_ids):
        if pd.isna(misconception_id):
            continue
        true_map[f"{question_id}_{letter}"] = int(misconception_id)

# Queries without a labelled misconception get NaN here.
pred_df['true_id'] = pred_df['query_id'].map(true_map)
pred_df.head()

Unnamed: 0,query_id,content_ids,true_id
0,1_A,"[2142, 2398, 2581, 2068, 838, 1755, 418, 2372, 5727, 167, 1871, 143, 2078, 2277, 2070, 891, 1256, 2256, 4523, 1535, 519, 1421, 1606, 113, 628]",2142
1,1_B,"[143, 891, 167, 418, 2078, 220, 979, 519, 2068, 1540, 2372, 4522, 5207, 1755, 3610, 2567, 1593, 4523, 628, 80, 113, 59, 1871, 1153]",143
2,1_C,"[2142, 143, 2277, 2070, 167, 1079, 3464, 418, 547, 2581, 838, 320, 519, 1755, 113, 5727, 3412, 2436, 891, 1421, 2068, 1535, 3632, 688]",2142
3,2_A,"[1287, 676, 4964, 1073, 1866, 5156, 2386, 3475, 3974, 276, 1521, 2555, 4245, 1408, 4136, 310, 5661, 4297, 3908, 632, 5550, 306, 1797, 4261, 912]",1287
4,2_C,"[1287, 306, 2551, 1338, 5550, 5156, 1408, 2319, 3908, 365, 1975, 2439, 4261, 3974, 5661, 397, 1073, 4136, 4245, 3438, 1059, 691, 4379, 3417]",1287


In [36]:
mcq_df.shape

(10594, 20)

In [37]:
pred_df[pred_df['query_id']=='0_D']

Unnamed: 0,query_id,content_ids,true_id
15855,0_D,"[1672, 1005, 1507, 2532, 1392, 706, 2488, 2306, 315, 1345, 1516, 4377, 5251, 328, 5594, 3524, 3353, 2518, 871, 4557, 158, 4465, 1999, 987, 1963, 2051, 2181, 2449, 638, 5622, 5528, 4789, 3110, 2586, 256, 1226, 488, 1316, 1011, 5587, 4149, 1670, 657, 4051, 5241, 1090, 1941, 1336]",1672


In [38]:
pred_df = pred_df[~pred_df['true_id'].isna()].copy()
pred_df = pred_df.reset_index(drop=True)
pred_df.shape

(16706, 3)

In [39]:
def get_true_content_index(row):
    """Locate the labelled misconception inside the candidate list.

    Args:
        row: mapping with ``content_ids`` (a list) and ``true_id``.

    Returns:
        Index of ``true_id`` within ``content_ids``, or -1 when it is absent.
    """
    candidates, target = row['content_ids'], row['true_id']
    try:
        position = candidates.index(target)
    except ValueError:
        # list.index raises ValueError when the target is not in the list.
        position = -1
    return position

pred_df['true_content_index'] = pred_df.apply(get_true_content_index, axis=1)

In [40]:
# pred_df.true_content_index.value_counts()

In [41]:
pred_df['true_id'] = pred_df['true_id'].astype(int)

In [42]:
pred_df.sample(5)

Unnamed: 0,query_id,content_ids,true_id,true_content_index
8622,72415_B,"[1925, 74, 3383, 1066, 2377, 1115, 2376, 1226, 2058, 845, 2252, 108, 1011, 4203, 969, 162, 1411, 939, 2386, 102, 373, 558, 1058, 4167]",1925,0
15058,403236_A,"[1736, 4687, 4697, 5234, 4475, 4482, 427, 299, 4302, 4545, 3108, 4686, 581, 4039, 4851, 1902, 17, 3099, 348, 13, 3615, 1339, 3177, 1013]",1736,0
4602,11518_D,"[4388, 5056, 440, 210, 4027, 2253, 5364, 3813, 672, 3012, 4198, 469, 215, 1474, 3811, 5078, 3751, 936, 2284, 713, 1240, 3113, 3317, 3200, 3658]",4388,0
6363,14516_B,"[768, 1375, 2560, 530, 607, 274, 4317, 5660, 174, 973, 4447, 2440, 5657, 5698, 4220, 4578, 4459, 1763, 4452, 1201, 4481, 5006, 5631, 4357]",768,0
9729,300787_A,"[2345, 649, 1780, 660, 1678, 4452, 4176, 2528, 1292, 743, 881, 2353, 4999, 1668, 1802, 4417, 653, 4048, 4588, 828, 2450, 71, 1671, 2090, 584]",2345,0


In [43]:
# pred_df.true_content_index.value_counts()

In [44]:
def add_teacher_scores(row):
    """Look up the teacher score of every candidate of a query.

    Args:
        row: mapping with ``query_id`` and ``content_ids``.

    Returns:
        List of scores read from ``teacher_score_map`` under the key
        ``"<query_id>|<content_id>"``, in candidate order; 0 for missing keys.
    """
    query_id = row['query_id']
    scores = []
    for content_id in row['content_ids']:
        scores.append(teacher_score_map.get(f"{query_id}|{content_id}", 0))
    return scores

pred_df['teacher_logits'] = pred_df.apply(add_teacher_scores, axis=1)

In [45]:
pred_df[pred_df['query_id']=='0_D']

Unnamed: 0,query_id,content_ids,true_id,true_content_index,teacher_logits
15855,0_D,"[1672, 1005, 1507, 2532, 1392, 706, 2488, 2306, 315, 1345, 1516, 4377, 5251, 328, 5594, 3524, 3353, 2518, 871, 4557, 158, 4465, 1999, 987, 1963, 2051, 2181, 2449, 638, 5622, 5528, 4789, 3110, 2586, 256, 1226, 488, 1316, 1011, 5587, 4149, 1670, 657, 4051, 5241, 1090, 1941, 1336]",1672,0,"[3.5, 5.249999903142452, 5.499999988824129, 5.9999997690320015, 4.249999910593033, 4.812499798834324, 2.187500074505806, 3.937499776482582, 4.812499836087227, -0.5624999403953552, 4.187499985098839, -0.125, 1.6875000298023224, 3.937499962747097, 3.2500000298023224, 1.5000000596046448, 1.4375, 3.1250000447034836, -0.9375, 1.1874999403953552, 0.0625, 2.499999985098839, -0.625, -1.4999999701976776, 3.187500074505806, -1.3749999701976776, 2.375000014901161, 0.5, -1.1875000596046448, 0.1875, -0.4375, 1.7499999105930328, 1.0000000298023224, 1.2500000596046448, -1.0625, -0.5625000596046448, -0.625, -2.4999999403953552, -0.5, 1.3124999403953552, -0.8750000596046448, -1.8749999403953552, 0.0, -2.6875000298023224, -0.7500000596046448, -1.1875000298023224, 1.8749999552965164, 0.1875]"


In [46]:
def stable_softmax(x, temp=1.0):
    """Softmax of ``x / temp``, computed after subtracting the maximum.

    Args:
        x: sequence or array of logits.
        temp: divisor applied to the logits before exponentiation.

    Returns:
        numpy array of the same length as ``x`` whose entries sum to 1.
    """
    scaled = np.array(x) / temp
    # Shifting by the max keeps every exponent <= 0, so np.exp cannot overflow.
    weights = np.exp(scaled - np.max(scaled))
    return weights / np.sum(weights)

In [47]:
pred_df['teacher_probs'] = pred_df['teacher_logits'].apply(stable_softmax)

In [48]:
pred_df.sample()

Unnamed: 0,query_id,content_ids,true_id,true_content_index,teacher_logits,teacher_probs
2618,1382_B,"[172, 1081, 2078, 1166, 4701, 1246, 1510, 1048, 633, 2170, 5017, 1909, 546, 684, 907, 1514, 1206, 5457, 4658, 217, 2147, 896, 2128, 1916]",172,0,"[7.312499975785613, 2.0625, 3.312500111758709, -0.8125, 1.1874999701976776, 0.9375000298023224, 1.5624999403953552, 2.3124999552965164, -1.5625, -0.8125, -1.2500000596046448, 1.3125, 0.125, 1.5625, 6.687499949708581, 2.062500014901161, 3.687500111758709, -0.8749999403953552, 0.8125, 1.2499999701976776, 1.3124999701976776, -0.1875, 0.4375, 2.937499925494194]","[0.612506021827765, 0.0032141366969789405, 0.011218440638378482, 0.00018132918428097404, 0.001339851475094765, 0.0010434774401992801, 0.0019494723350276118, 0.004127033027136988, 8.56538416901439e-05, 0.00018132918428097404, 0.00011707491365117373, 0.0015182506715903333, 0.0004630402934190451, 0.0019494724512252213, 0.3278508396706364, 0.0032141367448733096, 0.016322734814244625, 0.00017034301453609357, 0.0009208655814488337, 0.0014262644710112977, 0.001518250626342938, 0.0003387675154974123, 0.0006329010413373982, 0.007710312539352721]"


In [49]:
def _positive_probability(row):
    """Teacher probability at the true misconception's position, or 0 if it is not a candidate."""
    true_index = row['true_content_index']
    if true_index == -1:
        return 0
    return row['teacher_probs'][true_index]

pred_df['pos_score'] = pred_df.apply(_positive_probability, axis=1)
pred_df['pos_score'].describe()

count    16706.000000
mean         0.477579
std          0.319052
min          0.000087
25%          0.174123
50%          0.465565
75%          0.775886
max          0.999168
Name: pos_score, dtype: float64

In [50]:
all_content_ids = set(pred_df['true_id'].values.tolist())
print(len(all_content_ids))

4118


In [51]:
def filter_content_ids(row):
    """Keep the candidates whose teacher probability is strictly below the row's cutoff.

    Args:
        row: mapping with ``content_ids``, ``teacher_probs`` (paired by
            position) and ``cutoff``.

    Returns:
        List of the content ids that pass, in their original order.
    """
    threshold = row['cutoff']
    kept_ids = []
    for content_id, probability in zip(row['content_ids'], row['teacher_probs']):
        if probability < threshold:
            kept_ids.append(content_id)
    return kept_ids

def fill_to_n(row, n=24):
    """Return exactly ``n`` negative ids for a query, padding randomly if needed.

    When the row already has at least ``n`` filtered ids, the first ``n`` are
    returned. Otherwise the list is padded with ids sampled without
    replacement from the module-level ``all_content_ids``.

    The padding never includes an id that is already in the list, and never
    includes the row's own ``true_id``: ``all_content_ids`` is built from the
    true ids, so without this exclusion a query's positive could be drawn as
    one of its own negatives.

    Args:
        row: mapping with ``filtered_content_ids`` (list), ``num_filtered``
            (its length) and, optionally, ``true_id``.
        n: number of ids to return.

    Returns:
        List of ``n`` content ids: the filtered ids followed by the padding.

    Raises:
        ValueError: from ``random.sample`` when fewer candidates remain than
            are needed for padding.
    """
    filtered_ids = row['filtered_content_ids']
    if row['num_filtered'] >= n:
        return filtered_ids[:n]

    additional_needed = n - row['num_filtered']

    excluded = set(filtered_ids)
    true_id = row.get('true_id')  # both dict and pandas Series provide .get
    if true_id is not None:
        excluded.add(true_id)

    # Sorting pins the candidate order, so the draw depends only on the state
    # of the random module (call random.seed(...) beforehand for reproducibility).
    candidates = sorted(all_content_ids - excluded)
    additional_ids = random.sample(candidates, additional_needed)
    return filtered_ids + additional_ids

In [52]:
# A candidate stays a negative only if its teacher probability is below
# `margin` times the probability given to the true misconception.
margin = 0.9

curr_df = deepcopy(pred_df)
curr_df['cutoff'] = curr_df['pos_score'] * margin
curr_df['filtered_content_ids'] = curr_df.apply(filter_content_ids, axis=1)
curr_df['num_filtered'] = curr_df['filtered_content_ids'].map(len)

In [53]:
# curr_df['num_filtered'].value_counts()

In [54]:
# curr_df['num_filtered'].value_counts()

In [55]:
# fill_to_n pads short lists via random.sample; seed the global RNG first so
# the exported hard-negative mapping is identical on every full re-run.
random.seed(42)
curr_df['final_content_ids'] = curr_df.apply(fill_to_n, axis=1)
curr_df['final_num_filtered'] = curr_df['final_content_ids'].apply(len)

# query_id -> final list of negative content ids.
neg_map = dict(zip(curr_df['query_id'], curr_df['final_content_ids']))

In [56]:
curr_df['final_num_filtered'].value_counts()

final_num_filtered
24    16706
Name: count, dtype: int64

In [57]:
len(neg_map['780_D'])

24

# Save

In [58]:
save_dir = "../data/embedding_mix/silver_v3"
os.makedirs(save_dir, exist_ok=True)

In [59]:
# Write the query_id -> negative ids mapping as JSON.
hn_mapping_path = os.path.join(save_dir, "hn_mapping.json")
with open(hn_mapping_path, "w") as out_file:
    json.dump(neg_map, out_file)

In [60]:
# Columns written to the exported train.csv, grouped as:
# question metadata, question/answer text, per-option misconception ids.
keep_cols = (
    ['QuestionId', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName']
    + ['CorrectAnswer', 'QuestionText']
    + [f'Answer{letter}Text' for letter in 'ABCD']
    + [f'Misconception{letter}Id' for letter in 'ABCD']
)

valid_df = valid_df.loc[:, keep_cols].copy()
mcq_df = mcq_df.loc[:, keep_cols].copy()

# Only mcq_df is exported; concatenating valid_df as well
# (pd.concat([mcq_df, valid_df]).reset_index(drop=True)) is the disabled alternative.
ff_df = mcq_df.copy()

ff_df.to_csv(f"{save_dir}/train.csv", index=False)

In [61]:
content_df.to_csv(f"{save_dir}/misconception_mapping.csv", index=False)

In [62]:
# Write the "<query_id>|<content_id>" -> teacher score mapping as JSON.
teacher_mapping_path = os.path.join(save_dir, "teacher_mapping.json")
with open(teacher_mapping_path, "w") as out_file:
    json.dump(teacher_score_map, out_file)

In [63]:
kagglehub.dataset_upload("conjuring92/eedi-embed-mix-silver-v3", save_dir)

Uploading Dataset https://www.kaggle.com/datasets/conjuring92/eedi-embed-mix-silver-v3 ...
Starting upload for file ../data/embedding_mix/silver_v3/teacher_mapping.json


Uploading: 100%|█| 13.7M/13.7M [00:08<00:00, 1.60

Upload successful: ../data/embedding_mix/silver_v3/teacher_mapping.json (13MB)
Starting upload for file ../data/embedding_mix/silver_v3/hn_mapping.json



Uploading: 100%|█| 2.52M/2.52M [00:02<00:00, 881k

Upload successful: ../data/embedding_mix/silver_v3/hn_mapping.json (2MB)
Starting upload for file ../data/embedding_mix/silver_v3/misconception_mapping.csv



Uploading: 100%|█| 323k/323k [00:02<00:00, 154kB/

Upload successful: ../data/embedding_mix/silver_v3/misconception_mapping.csv (315KB)
Starting upload for file ../data/embedding_mix/silver_v3/train.csv



Uploading: 100%|█| 3.13M/3.13M [00:02<00:00, 1.09

Upload successful: ../data/embedding_mix/silver_v3/train.csv (3MB)





Your dataset instance has been created.
Files are being processed...
See at: https://www.kaggle.com/datasets/conjuring92/eedi-embed-mix-silver-v3


In [96]:
# teacher_score_map

# Sanity Checks

In [64]:
import os
import json
import pandas as pd

# Sanity checks on the exported artifacts: every file is re-read from save_dir
# and validated with assertions, so a failure stops the cell at the first
# violated invariant.
save_dir = "../data/embedding_mix/silver_v3"
files_to_check = ["hn_mapping.json", "teacher_mapping.json", "misconception_mapping.csv", "train.csv"]

# 1) All four files must exist.
for file in files_to_check:
    assert os.path.exists(os.path.join(save_dir, file)), f"{file} does not exist in {save_dir}"

# 2) Reload everything from disk.
with open(os.path.join(save_dir, "hn_mapping.json"), "r") as f:
    hn_mapping = json.load(f)

with open(os.path.join(save_dir, "teacher_mapping.json"), "r") as f:
    teacher_mapping = json.load(f)

misconception_df = pd.read_csv(os.path.join(save_dir, "misconception_mapping.csv"))
# NOTE: this rebinds train_df to the exported train.csv, replacing the
# fold-based training split assigned to the same name earlier in the notebook.
train_df = pd.read_csv(os.path.join(save_dir, "train.csv"))

# 3) Shape/type spot checks. Only the first entry of each JSON mapping is inspected.
assert len(hn_mapping) > 0, "hn_mapping is empty"
sample_key = next(iter(hn_mapping))
assert isinstance(hn_mapping[sample_key], list), "hn_mapping values should be lists"

assert len(teacher_mapping) > 0, "teacher_mapping is empty"
sample_key = next(iter(teacher_mapping))
assert isinstance(teacher_mapping[sample_key], (int, float)), "teacher_mapping values should be numeric"

assert not misconception_df.empty, "misconception_df is empty"
assert set(misconception_df.columns) == {"MisconceptionName", "MisconceptionId"}, "Unexpected columns in misconception_df"

# 4) train.csv must contain exactly the exported column set (order is not checked).
assert not train_df.empty, "train_df is empty"
expected_columns = {
    'QuestionId', 'ConstructId', 'ConstructName', 'SubjectId', 'SubjectName',
    'CorrectAnswer', 'QuestionText', 'AnswerAText', 'AnswerBText', 'AnswerCText', 'AnswerDText',
    'MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId'
}
assert set(train_df.columns) == expected_columns, "Unexpected columns in train_df"

assert not train_df['QuestionId'].isna().any(), "NaN values found in QuestionId column"
assert not train_df['CorrectAnswer'].isna().any(), "NaN values found in CorrectAnswer column"

# 5) Data types and allowed values.
assert train_df['QuestionId'].dtype == 'int64', "QuestionId should be int64"
assert train_df['CorrectAnswer'].isin(['A', 'B', 'C', 'D']).all(), "CorrectAnswer should only contain A, B, C, or D"

# 6) Cross-file consistency: every misconception id used in train.csv must be
#    present in the misconception mapping.
train_misconceptions = set(train_df['MisconceptionAId'].dropna()) | set(train_df['MisconceptionBId'].dropna()) | \
                       set(train_df['MisconceptionCId'].dropna()) | set(train_df['MisconceptionDId'].dropna())
mapping_misconceptions = set(misconception_df['MisconceptionId'])
assert train_misconceptions.issubset(mapping_misconceptions), "Misconceptions in train_df not found in misconception_df"

# 7) Every exported question must have at least one hn_mapping key
#    ("<QuestionId>_<letter>"); this is checked per question, not per option.
train_question_ids = set(train_df['QuestionId'])
hn_question_ids = set(int(key.split('_')[0]) for key in hn_mapping.keys())
assert train_question_ids.issubset(hn_question_ids), "Not all questions in train_df have entries in hn_mapping"

print("All sanity checks passed successfully!")

All sanity checks passed successfully!


In [65]:
len(train_question_ids.difference(set(hn_question_ids)))

0

In [66]:
# hn_mapping['1829_D']

In [67]:
# train_question_ids

In [68]:
# Additional invariants on the reloaded export (train_df, hn_mapping,
# teacher_mapping and misconception_df come from the previous sanity-check cell).

# QuestionId must be unique in train.csv.
assert train_df['QuestionId'].nunique() == len(train_df), "Duplicate QuestionIds found in train_df"

# Every id listed as a negative must exist in the misconception mapping.
hn_misconceptions = set()
for misconceptions in hn_mapping.values():
    hn_misconceptions.update(misconceptions)
assert hn_misconceptions.issubset(set(misconception_df['MisconceptionId'])), "Some MisconceptionIds in hn_mapping not found in misconception_df"

# teacher_mapping keys must split into exactly two parts on "|".
assert all(len(key.split('|')) == 2 for key in teacher_mapping.keys()), "Unexpected format in teacher_mapping keys"

# Every misconception id referenced by any option must exist in the mapping.
train_misconceptions = pd.concat([train_df[f'Misconception{letter}Id'].dropna() for letter in 'ABCD'])
assert set(train_misconceptions).issubset(set(misconception_df['MisconceptionId'])), "Some MisconceptionIds in train_df not found in misconception_df"

# At most three of the four options may carry a misconception id.
misconception_counts = train_df[['MisconceptionAId', 'MisconceptionBId', 'MisconceptionCId', 'MisconceptionDId']].notna().sum(axis=1)
assert misconception_counts.max() <= 3, "Some questions have more than 3 misconceptions"

# The option named by CorrectAnswer must not carry a misconception id.
for _, row in train_df.iterrows():
    correct_answer = row['CorrectAnswer']
    assert pd.isna(row[f'Misconception{correct_answer}Id']), f"Correct answer has a misconception for QuestionId {row['QuestionId']}"

print("All additional sanity checks passed successfully!")

All additional sanity checks passed successfully!


In [69]:
train_df.QuestionId.value_counts()

QuestionId
300000    1
15238     1
15245     1
15248     1
15251     1
         ..
401476    1
401478    1
401486    1
401490    1
1856      1
Name: count, Length: 10594, dtype: int64