import functools

import pandas as pd
import torch
import transformers
from accelerate import Accelerator
from datasets import Dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm

from t5_training_utils import (
    GenerationType,
    build_prefix_allowed_tokens_fn,
    convert_to_features,
    get_gen_type_attributes,
    get_model_full_name,
    get_prediction_name,
)

torch_dtype = "auto"
model_ckpt = "t5-base"
gen_type = GenerationType.ALL_TOKENS
input_max_length = 512
label_max_length = 6
use_task_prefix = True
class_names = [
    "Soccer",
    "Cricket",
    "Handball",
    "Snow Cycling",
]
non_eligible_classes = {"Snow Cycling"}
non_eligible_idx = [i for i, c in enumerate(class_names) if c in non_eligible_classes]
num_classes = len(class_names)

# Model training configuration (epochs and data_version also determine
# model_full_name below).
### Uncomment a config section for the model type
## For small test run
train_batch_size = 8
eval_batch_size = 8
epochs = 30
save_every_k_epochs = 5
seed = 3333
torch.manual_seed(seed)
logging_steps = 100  # len(squad["train"]) // batch_size
eval_step = 100
learning_rate = 2e-5
weight_decay = 0.01
data_version = "guidelines-fixed-occasion"
model_full_name = get_model_full_name(model_ckpt, gen_type, epochs, data_version)


def get_model(model_local_ckpt):
    """Load a fine-tuned seq2seq checkpoint and put it in eval mode."""
    model = transformers.AutoModelForSeq2SeqLM.from_pretrained(model_local_ckpt)
    return model.eval()


def get_dataset(
    data_path, tokenizer, class_text_map, task_prefix, accelerator, nrows=1024, offset=0
):
    """Read a window of `nrows` rows starting at `offset` and tokenize it."""
    df_data = pd.read_csv(data_path, sep="\t", nrows=nrows + offset).rename(
        columns={"query": "text"}
    )
    df_data = df_data.iloc[offset : offset + nrows]
    print(df_data)
    dataset = Dataset.from_pandas(df_data)
    dataset.reset_format()
    with accelerator.main_process_first():
        dataset = dataset.map(
            functools.partial(
                convert_to_features,
                class_text_map=class_text_map,
                task_prefix=task_prefix,
                query_key="text",
                label_key=None,
                tokenizer=tokenizer,
            )
        )
    return dataset, df_data


def get_predictions_accelerate(data_path, model_local_ckpt, nrows=1024, offset=0):
    accelerator = Accelerator()
    device = accelerator.device
    tokenizer = transformers.AutoTokenizer.from_pretrained(model_ckpt)
    class_text_map, max_decoding_length, task_prefix = get_gen_type_attributes(
        gen_type, tokenizer, class_names
    )
    task_prefix = task_prefix if use_task_prefix else ""
    dataset, df_data = get_dataset(
        data_path,
        tokenizer,
        class_text_map,
        task_prefix,
        accelerator,
        nrows=nrows,
        offset=offset,
    )
    model = get_model(model_local_ckpt)
    model = model.to(device)
    # Each allowed sequence is the decoder start token (0 for T5) followed by
    # the encoded class text; generation is constrained to these sequences.
    allowed_sequences = [[0] + tokenizer.encode(x) for x in class_text_map.values()]
    dataset.set_format("pt")
    # shuffle must be False: predictions are later written back to df_data by
    # position, so the dataloader has to preserve the dataset order.
    custom_dataloader = DataLoader(
        dataset, shuffle=False, batch_size=eval_batch_size, num_workers=4
    )
    model, custom_dataloader = accelerator.prepare(model, custom_dataloader)
    preds = []
    with torch.no_grad():
        for batch in tqdm(
            custom_dataloader, disable=not accelerator.is_local_main_process
        ):
            batch_input_ids = batch["input_ids"].to(device)
            batch_attention_mask = batch["attention_mask"].to(device)
            # For DDP models, call generate on the unwrapped model.
            # Taken from: https://github.com/huggingface/transformers/issues/18974
            batch_outs = accelerator.unwrap_model(model).generate(
                input_ids=batch_input_ids,
                attention_mask=batch_attention_mask,
                max_length=max_decoding_length,
                prefix_allowed_tokens_fn=build_prefix_allowed_tokens_fn(
                    allowed_sequences
                ),
            )
            batch_outs = accelerator.pad_across_processes(
                batch_outs, dim=1, pad_index=tokenizer.pad_token_id
            )
            batch_outs = accelerator.gather_for_metrics(batch_outs).cpu().numpy()
            preds.extend(tokenizer.batch_decode(batch_outs, skip_special_tokens=True))
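    # All processes have contributed their shards at this point;
    # gather_for_metrics has already dropped the duplicate samples Accelerate
    # pads onto the final batch, so preds should match the dataset length and
    # order exactly.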
    accelerator.wait_for_everyone()
    if accelerator.is_main_process:
        if len(preds) != len(dataset):
            raise ValueError(
                f"Predictions and dataset rows have different lengths. "
                f"preds: {len(preds)} dataset: {len(dataset)}"
            )
        pred_col = get_prediction_name(model_full_name)
        df_data[pred_col] = preds
        # Map the generated class texts back to the original class names.
        class_text_map_reversed = {val: key for key, val in class_text_map.items()}
        df_data[pred_col] = df_data[pred_col].apply(lambda x: class_text_map_reversed[x])
        # eligible = ~df_test[pred_col].isin(non_eligible_classes)
        eligible = ~df_data[pred_col].isin(
            {v for v in non_eligible_classes if v != "Cricket"}
        )
        df_data["eligible_pred"] = eligible
        output_path = data_path.replace(".tsv", f".predicted.{offset}.{nrows}.tsv")
        print(df_data)
        print(f"Writing df_data with predictions to {output_path}")
        df_data.to_csv(output_path, sep="\t", index=False)
    return df_data


def main():
    data_path = "data.tsv"
    offset = 400_000
    nrows = 153  # 600_000
    model_local_ckpt = "./model_path/checkpoint-2830"
    print(data_path)
    print(nrows)
    print(model_local_ckpt)
    get_predictions_accelerate(data_path, model_local_ckpt, nrows=nrows, offset=offset)


if __name__ == "__main__":
    main()
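
# For reference: build_prefix_allowed_tokens_fn is imported from
# t5_training_utils, whose implementation is not part of this file. A minimal
# sketch of a helper with the (batch_id, input_ids) -> allowed-token-ids
# contract that transformers' generate() expects for prefix_allowed_tokens_fn
# could look like the following. This is an illustrative assumption, not the
# actual helper:
#
#     def build_prefix_allowed_tokens_fn(allowed_sequences):
#         def prefix_allowed_tokens_fn(batch_id, input_ids):
#             prefix = input_ids.tolist()
#             allowed = {
#                 seq[len(prefix)]
#                 for seq in allowed_sequences
#                 if len(seq) > len(prefix) and seq[: len(prefix)] == prefix
#             }
#             # If no allowed sequence extends the current prefix, force EOS
#             # (token id 1 for T5).
#             return sorted(allowed) if allowed else [1]
#
#         return prefix_allowed_tokens_fn
#
# For multi-GPU inference, run this script under Accelerate, e.g.:
#     accelerate launch <this_script>.py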