#!/usr/bin/env bash
#
# Build a spaCy-style pattern file (JSONL) from the NAME1 column of the
# OS Open Names CSV files.
#
# Preparation: extract the zipped CSV download to ~/osopennames so that the
# CSV files sit in ~/osopennames/DATA/, then run this script from the
# directory that contains "osopennames" (or pass the directory as $1).
#
# Output: <dir>/mergednameonly.jsonl with one JSON object per name, e.g.
#   {"label": "B-Geo", "pattern": [{"LOWER": "san"}, {"LOWER": "francisco"}], "id": "san-francisco"}
#
# NOTE: rows are not filtered by content; if the CSV files carry a header
# row it is converted like any other row.

# No "set -e": every step that can fail is checked explicitly below.
set -uo pipefail

#######################################
# Convert CSV rows to pattern JSONL.
# Column 3 (NAME1) is taken from each row using a plain comma split, so the
# columns before it must not contain commas. Double quotes and carriage
# returns are dropped, the name is lower-cased and split on whitespace into
# one {"LOWER": ...} token per word; the id is the same words joined by "-".
# Rows whose name is empty are skipped.
# Arguments: CSV files to read; reads stdin when none are given.
# Outputs:   one JSON object per line on stdout.
# Returns:   exit status of awk.
#######################################
names_to_jsonl() {
  awk -F',' -v label='B-Geo' '
    {
      name = tolower($3)
      gsub(/["\r]/, "", name)          # replaces the manual quote removal
      count = split(name, word, " ")   # " " splits on runs of blanks
      if (count == 0) {
        next
      }
      pattern = ""
      id = ""
      for (i = 1; i <= count; i++) {
        pattern = pattern ((i > 1) ? ", " : "") "{\"LOWER\": \"" word[i] "\"}"
        id = id ((i > 1) ? "-" : "") word[i]
      }
      printf "{\"label\": \"%s\", \"pattern\": [%s], \"id\": \"%s\"}\n", label, pattern, id
    }
  ' "$@"
}

#######################################
# Entry point: convert <dir>/DATA/*.csv into <dir>/mergednameonly.jsonl.
# Arguments: $1 - directory holding the DATA folder (default: osopennames)
# Outputs:   the first five input rows on stdout as a sanity check;
#            diagnostics on stderr.
# Returns:   0 on success, non-zero on error.
#######################################
main() {
  local work_dir=${1:-osopennames}
  local staging=mergednameonly.json
  local output=mergednameonly.jsonl
  local -a csv_files

  if ! cd -- "$work_dir"; then
    printf 'error: cannot change into %s\n' "$work_dir" >&2
    return 1
  fi

  # An unmatched glob stays literal, so test that the first entry exists.
  csv_files=(DATA/*.csv)
  if [[ ! -e ${csv_files[0]} ]]; then
    printf 'error: no CSV files found in %s/DATA\n' "$PWD" >&2
    return 1
  fi

  # View the top 5 rows to make sure the input is what you expect.
  awk '{ print } NR == 5 { exit }' "${csv_files[@]}"

  # awk reads the files one after another, so a file without a trailing
  # newline cannot fuse its last row with the first row of the next file.
  if ! names_to_jsonl "${csv_files[@]}" > "$staging"; then
    rm -f -- "$staging"
    printf 'error: conversion failed\n' >&2
    return 1
  fi

  # Rename to .jsonl as that is what spaCy is looking for.
  mv -- "$staging" "$output"
}

main "$@"