Last active
March 22, 2024 11:19
-
-
Save uchida/7792ec17e152bcea1fea to your computer and use it in GitHub Desktop.
Revisions
-
uchida revised this gist
Aug 10, 2016 . 1 changed file with 25 additions and 16 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,9 +1,12 @@ #!/usr/bin/python # -*- coding: utf-8 -*- # CC0, dedicated to public domain by Akihiro Uchida import argparse import urllib2, os from HTMLParser import HTMLParser import re import calendar ARXIV_ID_RE = re.compile(r'arXiv:((\d\d)(\d\d)\.\d+)') class bibitem(object): def __init__(self, bibtype): @@ -93,36 +96,35 @@ def normalize(cls, dic): assert cls in dic value = dic[cls] result = dict() if cls == 'title mathjax': result['title'] = value.strip('\n') elif cls == 'authors': result['author'] = '' for c in value.strip('\n'): if c == ',': result['author'] += ' and ' else: result['author'] += c elif cls == 'abstract mathjax': parser = AbstParser() parser.feed(value.strip()) result['abstract'] = parser.text if cls.startswith('tablecell '): c = cls.partition('tablecell ')[-1] if c == 'arxivid': result['eprint'] = value m = ARXIV_ID_RE.match(value) result['url'] = 'http://arxiv.org/abs/{}'.format(m.group(1)) result['year'] = '20{}'.format(m.group(2)) result['month'] = calendar.month_abbr[int(m.group(3))] elif c == 'doi': result[c] = value result['doi-url'] = 'http://dx.doi.org/{}'.format(value) else: result[c] = dic[cls] return result some_classes = ('title mathjax', 'authors', 'abstract mathjax', 'tablecell comments', 'tablecell arxivid', 'tablecell subjects', 'tablecell jref', 'tablecell doi', 'tablecell report-number', 'tablecell msc-classes', 'tablecell acm-classes') @@ -132,23 +134,30 @@ def __init__(self): HTMLParser.__init__(self) self.item = bibitem('misc') self.stack = [] self.in_descriptor = False self.tmp = dict() return def handle_starttag(self, tag, attrs): for attr in attrs: if attr[1] in some_classes: self.stack.append({'tag': tag, 
'class': attr[1]}) if attr[1] == "descriptor": self.in_descriptor = True return def handle_endtag(self, tag): if self.in_descriptor and tag == "span": self.in_descriptor = False if self.stack != [] and tag == self.stack[-1]['tag']: s = self.stack.pop() self.item.add(normalize(s['class'], self.tmp)) return def handle_data(self, data): for c in some_classes: if self.in_descriptor: continue if self.stack != [] and self.stack[-1]['class'] == c: self.tmp[c] = self.tmp.get(c, '') + data return @@ -173,4 +182,4 @@ def handle_data(self, data): fflag = 'a' if os.path.exists(fpath) else 'w' with open(fpath, fflag) as f: f.write(parser.item.dump()) parser.close() -
uchida revised this gist
Jan 31, 2016 . 1 changed file with 6 additions and 6 deletions.There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -122,10 +122,10 @@ def normalize(cls, dic): result[c] = dic[cls] return result some_classes = ('title', 'authors', 'dateline', 'abstract', 'tablecell comments', 'tablecell arxivid', 'tablecell subjects', 'tablecell jref', 'tablecell doi', 'tablecell report-number', 'tablecell msc-classes', 'tablecell acm-classes') class MyHTMLParser(HTMLParser): def __init__(self): @@ -137,7 +137,7 @@ def __init__(self): def handle_starttag(self, tag, attrs): for attr in attrs: if attr[1] in some_classes: self.stack.append({'tag': tag, 'class':attr[1]}) return @@ -148,7 +148,7 @@ def handle_endtag(self, tag): return def handle_data(self, data): for c in some_classes: if self.stack != [] and self.stack[-1]['class'] == c: self.tmp[c] = self.tmp.get(c, '') + data return -
uchida created this gist
Jan 31, 2016 .There are no files selected for viewing
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# CC0, dedicated to public domain by Akihiro Uchida
"""Scrape an arXiv abstract page and append a BibTeX entry to a file.

Usage: ``script.py URL -f FILE``.  The page at URL is fetched, the elements
whose attribute value is listed in ``some_class`` are collected, and the
resulting ``@misc`` entry is appended to FILE.
"""
import argparse
import io
import os
import re

try:  # Python 2 module names
    import urllib2
    from HTMLParser import HTMLParser
except ImportError:  # Python 3 equivalents
    import urllib.request as urllib2
    from html.parser import HTMLParser

# The identifier is the run of non-blank, non-bracket characters that follows
# the "arXiv:" prefix, so both 4- and 5-digit new-style ids and old-style
# "archive/number" ids are captured whole.
ARXIV_ID_RE = re.compile(r'arXiv:([^\s\[\]()]+)')


class bibitem(object):
    """A single BibTeX entry: an entry type plus a dict of string fields."""

    def __init__(self, bibtype):
        """Create an empty entry of type *bibtype* (e.g. ``'misc'``)."""
        assert isinstance(bibtype, str)
        self.bibtype = bibtype
        self.field = dict()

    def add(self, dic):
        """Merge *dic* into the fields, appending to values already set."""
        assert isinstance(dic, dict)
        for k, v in dic.items():
            self.field[k] = self.field.get(k, '') + v

    def gen_key(self):
        """Build the citation key from year, authors and leading title words.

        For each author the longest word (first one on ties) is used, with
        surrounding commas/periods removed.  Title words are appended in
        title case up to and including the first word longer than 4 chars.
        """
        parts = []
        if 'year' in self.field:
            parts.append(self.field['year'])
        if 'author' in self.field:
            # Authors are joined with ' and '; splitting on the bare
            # substring 'and' would cut names such as "Alexander" in two.
            for author in self.field['author'].split(' and '):
                words = author.split()
                if words:  # skip empty chunks instead of reusing a stale name
                    parts.append(max(words, key=len).strip(',.'))
        if 'title' in self.field:
            for w in self.field['title'].split():
                parts.append(w.title())
                if len(w) > 4:
                    break
        return ''.join(parts)

    def dump(self):
        """Return the entry as BibTeX text, omitting empty fields."""
        chunks = ['@{}{{{}'.format(self.bibtype, self.gen_key())]
        for k, v in self.field.items():
            if v not in ['', None]:
                chunks.append(',\n{}={{{}}}'.format(k, v))
        chunks.append('}\n')
        return ''.join(chunks)


class AbstParser(object):
    """Small state machine that converts abstract text to TeX-friendly text.

    Double quotes become `` and '', newlines become spaces, and a hyphen
    followed by a space or newline (a line-break hyphenation) is removed
    together with that whitespace.  The result accumulates in ``self.text``.
    """

    def __init__(self):
        self.parse = self.parse_main
        self.text = ''

    def feed(self, text):
        """Process *text* one character at a time."""
        i = 0
        while i < len(text):
            (self.parse, i) = self.parse(text, i)

    def close(self):
        """Flush a hyphen that was still pending when the input ended."""
        if self.parse == self.parse_hyphen:
            self.text += '-'
            self.parse = self.parse_main

    def parse_main(self, text, i):
        """Default state: outside quotes, no pending hyphen."""
        c = text[i]
        if c == '"':
            self.text += '``'
            return (self.parse_quote, i+1)
        if c == '-':
            # Defer the decision until the next character is known.
            return (self.parse_hyphen, i+1)
        self.text += ' ' if c == '\n' else c
        return (self.parse_main, i+1)

    def parse_quote(self, text, i):
        """Inside a double-quoted span."""
        c = text[i]
        if c == '"':
            self.text += '\'\''
            return (self.parse_main, i+1)
        self.text += ' ' if c == '\n' else c
        return (self.parse_quote, i+1)

    def parse_hyphen(self, text, i):
        """State right after a hyphen seen in the default state."""
        c = text[i]
        if c in (' ', '\n'):
            # Hyphenation at a line break: drop hyphen and whitespace.
            return (self.parse_main, i+1)
        # A real hyphen: emit it and let parse_main handle the current
        # character (returning i+1 here would silently swallow it).
        self.text += '-'
        return (self.parse_main, i)


def normalize(cls, dic):
    """Convert the raw text collected for class *cls* into BibTeX fields.

    *dic* maps class names to the text gathered from the page; *cls* must be
    one of its keys.  Returns a dict of field name -> value (possibly empty).
    """
    assert cls in dic
    value = dic[cls]
    result = dict()
    if cls == 'title':
        result['title'] = value.partition('Title:')[-1].strip('\n')
    elif cls == 'authors':
        names = value.partition('Authors:')[-1].strip('\n')
        result['author'] = names.replace(',', ' and ')
    elif cls == 'dateline':
        # Expected shape: "(Submitted on 31 Jan 2016 ...)".  Strip blanks
        # first so the parentheses are really at the ends of the string.
        date = value.strip().strip('()').split()
        if len(date) >= 5:
            (result['year'], result['month']) = (date[4], date[3])
    elif cls == 'abstract':
        parser = AbstParser()
        parser.feed(value.partition('Abstract:')[-1].strip())
        parser.close()
        result[cls] = parser.text
    if cls.startswith('tablecell '):
        c = cls.partition('tablecell ')[-1]
        if c == 'arxivid':
            m = ARXIV_ID_RE.search(value)
            if m:
                result['eprint'] = 'arXiv:' + m.group(1)
                result['url'] = 'http://arxiv.org/abs/{}'.format(m.group(1))
            else:
                # Unrecognised text: keep the historical fixed-width slices.
                result['eprint'] = value[:15]
                result['url'] = 'http://arxiv.org/abs/{}'.format(value[6:15])
        elif c == 'doi':
            result[c] = value
            result['doi-url'] = 'http://dx.doi.org/{}'.format(value)
        else:
            result[c] = value
    return result


# Attribute values (in practice: HTML class strings) whose element text is
# collected from the page.
some_class = ('title', 'authors', 'dateline', 'abstract',
              'tablecell comments', 'tablecell arxivid',
              'tablecell subjects', 'tablecell jref',
              'tablecell doi', 'tablecell report-number',
              'tablecell msc-classes', 'tablecell acm-classes')


class MyHTMLParser(HTMLParser):
    """Collect the text of the tracked elements into ``self.item``."""

    def __init__(self):
        HTMLParser.__init__(self)
        self.item = bibitem('misc')
        self.stack = []      # open tracked elements: {'tag', 'class'}
        self.tmp = dict()    # class -> text gathered for the open element

    def handle_starttag(self, tag, attrs):
        """Push the element when one of its attribute values is tracked."""
        for attr in attrs:
            if attr[1] in some_class:
                self.stack.append({'tag': tag, 'class': attr[1]})

    def handle_endtag(self, tag):
        """Close the innermost tracked element and store its fields."""
        if self.stack and tag == self.stack[-1]['tag']:
            cls = self.stack.pop()['class']
            # Pop the buffer so that a later element with the same class
            # does not re-add this text, and skip elements with no text.
            text = self.tmp.pop(cls, None)
            if text is not None:
                self.item.add(normalize(cls, {cls: text}))

    def handle_data(self, data):
        """Append *data* to the buffer of the innermost tracked element."""
        if self.stack:
            cls = self.stack[-1]['class']
            self.tmp[cls] = self.tmp.get(cls, '') + data


def main():
    """Fetch the page given on the command line and append its entry."""
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('url', type=str)
    arg_parser.add_argument('-f', '--file', type=str, required=True)
    args = arg_parser.parse_args()

    http_proxy = os.environ.get('http_proxy')
    proxy = {'http': http_proxy} if http_proxy is not None else {}
    opener = urllib2.build_opener(urllib2.ProxyHandler(proxy))

    response = opener.open(args.url)
    try:
        raw = response.read()
    finally:
        response.close()
    if not isinstance(raw, str):  # Python 3 returns bytes
        raw = raw.decode('utf-8', 'replace')

    parser = MyHTMLParser()
    parser.feed(raw)
    parser.close()  # flush buffered input before the entry is dumped

    entry = parser.item.dump()
    fpath = os.path.abspath(args.file)
    # Mode 'a' creates the file when it does not exist yet.
    if isinstance(entry, bytes):  # Python 2 byte string
        with open(fpath, 'a') as f:
            f.write(entry)
    else:
        with io.open(fpath, 'a', encoding='utf-8') as f:
            f.write(entry)


if __name__ == '__main__':
    main()