Created
December 27, 2020 06:46
-
-
Save lppier/0f10d3a9d13c76c24f65a77b3d02b76a to your computer and use it in GitHub Desktop.
Revisions
-
lppier created this gist
Dec 27, 2020. There are no files selected for viewing.
from pathlib import Path

import pandas as pd
import torch
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    DistilBertForSequenceClassification,
    DistilBertTokenizerFast,
    Trainer,
    TrainingArguments,
)

# IMDB Dataset can be found here
# wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
# tar -xf aclImdb_v1.tar.gz


def compute_metrics(pred):
    """Compute accuracy, F1, precision and recall for an evaluation pass.

    Args:
        pred: Object exposing ``label_ids`` (true labels) and ``predictions``
            (per-class scores); the predicted class is the argmax over the
            last axis of ``predictions``.

    Returns:
        Dict with the keys ``accuracy``, ``f1``, ``precision`` and ``recall``.
        Precision/recall/F1 use ``average="binary"``.
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary"
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}


def read_imdb_split(split_dir):
    """Read the ``pos`` and ``neg`` sub-directories of one dataset split.

    Args:
        split_dir: Path (str or ``Path``) of a directory containing ``pos``
            and ``neg`` sub-directories of UTF-8 text files.

    Returns:
        Tuple ``(texts, labels)`` of parallel lists; the label is 1 for
        entries under ``pos`` and 0 for entries under ``neg``.
    """
    split_dir = Path(split_dir)
    texts = []
    labels = []
    for label_dir in ["pos", "neg"]:
        for text_file in (split_dir / label_dir).iterdir():
            texts.append(text_file.read_text(encoding="utf8"))
            # Compare by value: `is` tests object identity and only happens to
            # work for interned literals (SyntaxWarning on Python 3.8+).
            labels.append(0 if label_dir == "neg" else 1)
    return texts, labels


train_texts, train_labels = read_imdb_split("data/aclImdb/train")
test_texts, test_labels = read_imdb_split("data/aclImdb/test")

# Further split training set to get a validation set
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_texts, train_labels, test_size=0.1
)

# Get BERT Tokens
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")
train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)


class IMDbDataset(torch.utils.data.Dataset):
    """Torch dataset pairing tokenizer encodings with their labels."""

    def __init__(self, encodings, labels):
        """Store the encodings mapping and the parallel sequence of labels."""
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        """Return example ``idx`` as a dict of tensors, plus a ``labels`` entry."""
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Return the number of examples (one per label)."""
        return len(self.labels)


train_dataset = IMDbDataset(train_encodings, train_labels)
val_dataset = IMDbDataset(val_encodings, val_labels)
test_dataset = IMDbDataset(test_encodings, test_labels)

training_args = TrainingArguments(
    output_dir="./results",  # output directory
    num_train_epochs=1,  # total number of training epochs
    per_device_train_batch_size=16,  # batch size per device during training
    per_device_eval_batch_size=64,  # batch size for evaluation
    warmup_steps=500,  # number of warmup steps for learning rate scheduler
    weight_decay=0.01,  # strength of weight decay
    logging_dir="./logs",  # directory for storing logs
    logging_steps=10,
)

model = DistilBertForSequenceClassification.from_pretrained("distilbert-base-uncased")

trainer = Trainer(
    model=model,  # the instantiated 🤗 Transformers model to be trained
    args=training_args,  # training arguments, defined above
    train_dataset=train_dataset,  # training dataset
    eval_dataset=val_dataset,  # evaluation dataset
    compute_metrics=compute_metrics,
)

trainer.train()  # this saves the models in the checkpoints under results folder

# Inference Code
model = DistilBertForSequenceClassification.from_pretrained("./results_old/checkpoint-3500")
model.eval()  # make inference mode explicit (no dropout)

# The CSV has no header row: column 0 holds the labels, column 1 the texts.
df_labels = pd.read_csv("data/comprehendimdbtest.csv", header=None)
test_data = df_labels.iloc[:, 1].to_list()

predictions = []
# Need this loop otherwise my cpu memory just loads up
# Gradients are not needed for inference; without no_grad() every forward pass
# also records an autograd graph, which inflates memory use.
with torch.no_grad():
    for text in test_data:
        encoding = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**encoding)
        pt_predictions = F.softmax(outputs[0], dim=-1)
        # warning, don't append the tensor too memory intensive!
        predictions.append(pt_predictions.argmax(-1).item())
        # NOTE(review): the original indentation was lost; this print is assumed
        # to sit inside the loop (one line per example) -- confirm.
        print(pt_predictions)

precision, recall, f1, _ = precision_recall_fscore_support(
    df_labels.iloc[:, 0], predictions, average="binary"
)
acc = accuracy_score(df_labels.iloc[:, 0], predictions)
metrics = {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
print(metrics)