{ "cells": [ { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [ "import json\n", "import gzip\n", "import random\n", "from pprint import pprint\n" ] }, { "cell_type": "code", "execution_count": 121, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Reading the dataset ...\n", "Adding http://www.jstor.org/stable/10.1086/491498 to sample\n", "Adding http://www.jstor.org/stable/10.1086/432295 to sample\n", "Adding http://www.jstor.org/stable/10.1086/379413 to sample\n", "Adding http://www.jstor.org/stable/228664 to sample\n", "Adding http://www.jstor.org/stable/236768 to sample\n", "Adding http://www.jstor.org/stable/227706 to sample\n", "Adding http://www.jstor.org/stable/231357 to sample\n", "Adding http://www.jstor.org/stable/3080697 to sample\n", "Adding http://www.jstor.org/stable/229231 to sample\n", "Adding http://www.jstor.org/stable/230556 to sample\n", "Adding http://www.jstor.org/stable/10.1086/670902 to sample\n", "Adding http://www.jstor.org/stable/228263 to sample\n", "Adding http://www.jstor.org/stable/229843 to sample\n", "Adding http://www.jstor.org/stable/10.1086/678012 to sample\n", "Adding http://www.jstor.org/stable/230061 to sample\n", "Adding http://www.jstor.org/stable/10.1086/376025 to sample\n", "Adding http://www.jstor.org/stable/10.1086/653929 to sample\n", "Adding http://www.jstor.org/stable/226119 to sample\n", "Adding http://www.jstor.org/stable/10.1086/491505 to sample\n", "Adding http://www.jstor.org/stable/235887 to sample\n", "Adding http://www.jstor.org/stable/10.1086/682793 to sample\n", "Adding http://www.jstor.org/stable/227572 to sample\n", "Adding http://www.jstor.org/stable/10.1086/386402 to sample\n", "Adding http://www.jstor.org/stable/223695 to sample\n", "Adding http://www.jstor.org/stable/235969 to sample\n", "Dataset reading complete. 25 total documents.\n" ] } ], "source": [ "sample_doc_numbers = random.sample(range(0, 19000), 25)\n", "sample_docs = []\n", "\n", "print(\"Reading the dataset ...\")\n", "\n", "with gzip.open(\"./datasets/dset1.jsonl.gz\", \"rb\") as inf:\n", " for row_num, row in enumerate(inf):\n", " doc = json.loads(row)\n", " if row_num not in sample_doc_numbers:\n", " continue\n", " print(f\"Adding {doc['id']} to sample\")\n", " sample_docs.append(doc)\n", "\n", "print(f\"Dataset reading complete. {len(sample_docs)} total documents.\")" ] }, { "cell_type": "code", "execution_count": 122, "metadata": {}, "outputs": [], "source": [ "doc1 = sample_docs[0]" ] }, { "cell_type": "code", "execution_count": 123, "metadata": {}, "outputs": [], "source": [ "to_delete = [\"unigramCount\", \"bigramCount\", \"trigramCount\", \"fullText\"]\n", "for k in to_delete:\n", " if k in doc1.keys():\n", " del doc1[k]" ] }, { "cell_type": "code", "execution_count": 124, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'creator': ['Joan‐Pau Rubiés'],\n", " 'datePublished': '2005-06-01',\n", " 'docType': 'article',\n", " 'id': 'http://www.jstor.org/stable/10.1086/491498',\n", " 'identifier': [{'name': 'issn', 'value': '00211753'},\n", " {'name': 'oclc', 'value': '49976319'},\n", " {'name': 'lccn', 'value': '2002-227035'},\n", " {'name': 'local_uuid',\n", " 'value': 'd22c16bb-d068-3bdf-9962-8d0db608891e'},\n", " {'name': 'local_doi', 'value': '10.1086/491498'},\n", " {'name': 'journal_id', 'value': 'isis'}],\n", " 'isPartOf': 'Isis',\n", " 'issueNumber': '2',\n", " 'language': ['eng'],\n", " 'outputFormat': ['unigram', 'bigram', 'trigram'],\n", " 'pageCount': 2,\n", " 'pageEnd': '276',\n", " 'pageStart': '275',\n", " 'pagination': 'pp. 275-276',\n", " 'provider': 'jstor',\n", " 'publicationYear': 2005,\n", " 'publisher': 'The University of Chicago Press',\n", " 'sourceCategory': ['History of Science & Technology', 'History'],\n", " 'title': 'Review Article',\n", " 'url': 'http://www.jstor.org/stable/10.1086/491498',\n", " 'volumeNumber': '96',\n", " 'wordCount': 1051}\n" ] } ], "source": [ "pprint(doc1)" ] }, { "cell_type": "code", "execution_count": 125, "metadata": {}, "outputs": [], "source": [ "fields_to_keep = [\n", " \"id\",\n", " \"title\",\n", " \"isPartOf\",\n", " \"publicationYear\",\n", " \"creator\",\n", " \"wordCount\",\n", " \"provider\",\n", " \"url\"\n", "]\n", "filtered_sample_docs = []\n", "for doc in sample_docs:\n", " new_doc = {}\n", " for f in fields_to_keep:\n", " value = doc.get(f)\n", " new_doc[f] = value\n", " filtered_sample_docs.append(new_doc)" ] }, { "cell_type": "code", "execution_count": 126, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'creator': ['Joan‐Pau Rubiés'],\n", " 'id': 'http://www.jstor.org/stable/10.1086/491498',\n", " 'isPartOf': 'Isis',\n", " 'provider': 'jstor',\n", " 'publicationYear': 2005,\n", " 'title': 'Review Article',\n", " 'url': 'http://www.jstor.org/stable/10.1086/491498',\n", " 'wordCount': 1051}\n" ] } ], "source": [ "pprint(filtered_sample_docs[0])" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [], "source": [ "with open(\"datasets/filtered_dset1.json\", \"w\") as of:\n", " json.dump(filtered_sample_docs, of)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.6.6" } }, "nbformat": 4, "nbformat_minor": 2 }