Skip to content

Instantly share code, notes, and snippets.

@uchida
Last active March 22, 2024 11:19
Show Gist options
  • Select an option

  • Save uchida/7792ec17e152bcea1fea to your computer and use it in GitHub Desktop.

Select an option

Save uchida/7792ec17e152bcea1fea to your computer and use it in GitHub Desktop.

Revisions

  1. uchida revised this gist Aug 10, 2016. 1 changed file with 25 additions and 16 deletions.
    41 changes: 25 additions & 16 deletions arxiv2bib.py
    Original file line number Diff line number Diff line change
    @@ -1,9 +1,12 @@
    #!/usr/bin/env python
    #!/usr/bin/python
    # -*- coding: utf-8 -*-
    # CC0, dedicated to public domain by Akihiro Uchida
    import argparse
    import urllib2, os
    from HTMLParser import HTMLParser
    import re
    import calendar
    ARXIV_ID_RE = re.compile(r'arXiv:((\d\d)(\d\d)\.\d+)')

    class bibitem(object):
    def __init__(self, bibtype):
    @@ -93,36 +96,35 @@ def normalize(cls, dic):
    assert cls in dic
    value = dic[cls]
    result = dict()
    if cls == 'title':
    result['title'] = value.partition('Title:')[-1].strip('\n')
    if cls == 'title mathjax':
    result['title'] = value.strip('\n')
    elif cls == 'authors':
    result['author'] = ''
    for c in value.partition('Authors:')[-1].strip('\n'):
    for c in value.strip('\n'):
    if c == ',':
    result['author'] += ' and '
    else:
    result['author'] += c
    elif cls == 'dateline':
    date = value.strip('()').split()
    (result['year'], result['month']) = (date[4], date[3])
    elif cls == 'abstract':
    elif cls == 'abstract mathjax':
    parser = AbstParser()
    parser.feed(value.partition('Abstract:')[-1].strip())
    # todo
    result[cls] = parser.text
    parser.feed(value.strip())
    result['abstract'] = parser.text
    if cls.startswith('tablecell '):
    c = cls.partition('tablecell ')[-1]
    if c == 'arxivid':
    result['eprint'] = value[:15]
    result['url'] = 'http://arxiv.org/abs/{}'.format(value[6:15])
    result['eprint'] = value
    m = ARXIV_ID_RE.match(value)
    result['url'] = 'http://arxiv.org/abs/{}'.format(m.group(1))
    result['year'] = '20{}'.format(m.group(2))
    result['month'] = calendar.month_abbr[int(m.group(3))]
    elif c == 'doi':
    result[c] = value
    result['doi-url'] = 'http://dx.doi.org/{}'.format(value)
    else:
    result[c] = dic[cls]
    return result

    some_classes = ('title', 'authors', 'dateline', 'abstract',
    some_classes = ('title mathjax', 'authors', 'abstract mathjax',
    'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
    'tablecell jref', 'tablecell doi', 'tablecell report-number',
    'tablecell msc-classes', 'tablecell acm-classes')
    @@ -132,23 +134,30 @@ def __init__(self):
    HTMLParser.__init__(self)
    self.item = bibitem('misc')
    self.stack = []
    self.in_descriptor = False
    self.tmp = dict()
    return

    def handle_starttag(self, tag, attrs):
    for attr in attrs:
    if attr[1] in some_classes:
    self.stack.append({'tag': tag, 'class':attr[1]})
    self.stack.append({'tag': tag, 'class': attr[1]})
    if attr[1] == "descriptor":
    self.in_descriptor = True
    return

    def handle_endtag(self, tag):
    if self.in_descriptor and tag == "span":
    self.in_descriptor = False
    if self.stack != [] and tag == self.stack[-1]['tag']:
    s = self.stack.pop()
    self.item.add(normalize(s['class'], self.tmp))
    return

    def handle_data(self, data):
    for c in some_classes:
    if self.in_descriptor:
    continue
    if self.stack != [] and self.stack[-1]['class'] == c:
    self.tmp[c] = self.tmp.get(c, '') + data
    return
    @@ -173,4 +182,4 @@ def handle_data(self, data):
    fflag = 'a' if os.path.exists(fpath) else 'w'
    with open(fpath, fflag) as f:
    f.write(parser.item.dump())
    parser.close()
    parser.close()
  2. uchida revised this gist Jan 31, 2016. 1 changed file with 6 additions and 6 deletions.
    12 changes: 6 additions & 6 deletions arxiv2bib.py
    Original file line number Diff line number Diff line change
    @@ -122,10 +122,10 @@ def normalize(cls, dic):
    result[c] = dic[cls]
    return result

    some_class = ('title', 'authors', 'dateline', 'abstract',
    'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
    'tablecell jref', 'tablecell doi', 'tablecell report-number',
    'tablecell msc-classes', 'tablecell acm-classes')
    some_classes = ('title', 'authors', 'dateline', 'abstract',
    'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
    'tablecell jref', 'tablecell doi', 'tablecell report-number',
    'tablecell msc-classes', 'tablecell acm-classes')

    class MyHTMLParser(HTMLParser):
    def __init__(self):
    @@ -137,7 +137,7 @@ def __init__(self):

    def handle_starttag(self, tag, attrs):
    for attr in attrs:
    if attr[1] in some_class:
    if attr[1] in some_classes:
    self.stack.append({'tag': tag, 'class':attr[1]})
    return

    @@ -148,7 +148,7 @@ def handle_endtag(self, tag):
    return

    def handle_data(self, data):
    for c in some_class:
    for c in some_classes:
    if self.stack != [] and self.stack[-1]['class'] == c:
    self.tmp[c] = self.tmp.get(c, '') + data
    return
  3. uchida created this gist Jan 31, 2016.
    176 changes: 176 additions & 0 deletions arxiv2bib.py
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,176 @@
    #!/usr/bin/env python
    # -*- coding: utf-8 -*-
    # CC0, dedicated to public domain by Akihiro Uchida
    import argparse
    import urllib2, os
    from HTMLParser import HTMLParser

    class bibitem(object):
        """A single BibTeX entry: an entry type plus a dict of string fields.

        Fields are accumulated with ``add``; ``dump`` renders the entry as
        BibTeX source, using the citation key built by ``gen_key``.
        """

        def __init__(self, bibtype):
            """Create an empty entry whose BibTeX type is *bibtype*."""
            assert isinstance(bibtype, str)
            self.bibtype = bibtype
            self.field = dict()
            return

        def add(self, dic):
            """Merge the fields of *dic* into this entry.

            A value for a field that is already present is appended to the
            stored string instead of replacing it.
            """
            assert isinstance(dic, dict)
            # items() rather than iteritems(): identical result here and valid
            # on both Python 2 and Python 3.
            for k, v in dic.items():
                self.field[k] = self.field.get(k, '') + v
            return

        def gen_key(self):
            """Return a citation key built from year, authors and title.

            The key is the ``year`` field, then the longest word of every
            author (stripped of ``,`` and ``.``), then the title-cased words
            of the title up to and including the first word longer than four
            characters.  Missing fields are skipped.
            """
            key = ''
            if 'year' in self.field:
                key += self.field['year']
            if 'author' in self.field:
                # Split on the BibTeX separator ' and ', not on the bare
                # substring 'and', which would cut names such as "Anderson".
                for author in self.field['author'].split(' and '):
                    words = author.split()
                    if not words:
                        # Empty author segment: nothing to contribute.
                        continue
                    # max() keeps the first of several equally long words.
                    key += max(words, key=len).strip(',.')
            if 'title' in self.field:
                for w in self.field['title'].split():
                    key += w.title()
                    if len(w) > 4:
                        break
            return key

        def dump(self):
            """Return the entry as BibTeX source, omitting empty fields."""
            parts = ['@{}{{{}'.format(self.bibtype, self.gen_key())]
            for k, v in self.field.items():
                if v not in ['', None]:
                    parts.append(',\n{}={{{}}}'.format(k, v))
            parts.append('}\n')
            return ''.join(parts)

    class AbstParser(object):
        """Character-level state machine that tidies abstract text for LaTeX.

        * straight double quotes become a `` ... '' pair,
        * newlines become single spaces,
        * a hyphen directly followed by a space or newline is dropped together
          with that character (re-joining a word split across a line break),
        * any other hyphen is kept.

        The converted text accumulates in ``self.text``.  ``self.parse`` holds
        the handler for the next character; every handler returns a
        ``(next_handler, next_index)`` pair.
        """

        def __init__(self):
            self.parse = self.parse_main
            self.text = ''
            return

        def feed(self, text):
            """Run the state machine over *text*, appending to ``self.text``.

            The parser state is kept between calls, so a hyphen that is the
            very last character fed stays pending and is not emitted.
            """
            i = 0
            while i < len(text):
                (self.parse, i) = self.parse(text, i)
            return

        def parse_main(self, text, i):
            """Handle one character outside a quotation."""
            c = text[i]
            if c == '"':
                self.text += '``'
                return (self.parse_quote, i+1)
            if c == '-':
                # Decide what to do with the hyphen once the next character
                # is known.
                return (self.parse_hyphen, i+1)
            self.text += ' ' if c == '\n' else c
            return (self.parse_main, i+1)

        def parse_quote(self, text, i):
            """Handle one character inside a double-quoted span."""
            c = text[i]
            if c == '"':
                self.text += '\'\''
                return (self.parse_main, i+1)
            self.text += ' ' if c == '\n' else c
            return (self.parse_quote, i+1)

        def parse_hyphen(self, text, i):
            """Handle the character that follows a hyphen."""
            c = text[i]
            if c in (' ', '\n'):
                # Hyphenation break: drop both the hyphen and the whitespace.
                return (self.parse_main, i+1)
            # A real hyphen: emit it and let parse_main process the current
            # character (do not advance, otherwise that character is lost).
            self.text += '-'
            return (self.parse_main, i)

    def normalize(cls, dic):
        """Turn the raw text collected for one HTML class into BibTeX fields.

        *cls* is the class string the text was collected under and *dic* maps
        class strings to their collected text; ``dic[cls]`` must exist.
        Returns a new dict of field name -> string:

        * ``'title'``    -> ``title`` (text after the ``Title:`` label)
        * ``'authors'``  -> ``author`` (text after ``Authors:``, commas
          replaced by ``' and '``)
        * ``'dateline'`` -> ``year`` and ``month`` (5th and 4th
          whitespace-separated tokens; nothing if there are fewer than five)
        * ``'abstract'`` -> ``abstract`` (text after ``Abstract:`` run through
          ``AbstParser``)
        * ``'tablecell arxivid'`` -> ``eprint`` and ``url``
        * ``'tablecell doi'``     -> ``doi`` and ``doi-url``
        * any other ``'tablecell X'`` -> field ``X`` with the raw text
        """
        import re  # local import: only the arXiv-id branch needs it

        assert cls in dic
        value = dic[cls]
        result = dict()
        if cls == 'title':
            result['title'] = value.partition('Title:')[-1].strip('\n')
        elif cls == 'authors':
            names = value.partition('Authors:')[-1].strip('\n')
            result['author'] = names.replace(',', ' and ')
        elif cls == 'dateline':
            # Strip surrounding whitespace first so the parentheses really are
            # at the ends; otherwise the year keeps a trailing ')'.
            date = value.strip().strip('()').split()
            if len(date) >= 5:
                (result['year'], result['month']) = (date[4], date[3])
        elif cls == 'abstract':
            parser = AbstParser()
            parser.feed(value.partition('Abstract:')[-1].strip())
            # todo (left unspecified by the original author)
            result[cls] = parser.text
        if cls.startswith('tablecell '):
            c = cls.partition('tablecell ')[-1]
            if c == 'arxivid':
                # New-style ids have 4 or 5 digits after the dot and old-style
                # ids look like "hep-th/9901001"; fixed-width slicing truncates
                # both, so match the identifier explicitly.
                m = re.search(
                    r'arXiv:(\d{4}\.\d{4,5}|[a-z\-]+(?:\.[A-Z]{2})?/\d{7})',
                    value)
                if m:
                    result['eprint'] = m.group(0)
                    result['url'] = 'http://arxiv.org/abs/{}'.format(m.group(1))
                else:
                    # Unrecognised format: keep the historical slicing.
                    result['eprint'] = value[:15]
                    result['url'] = 'http://arxiv.org/abs/{}'.format(value[6:15])
            elif c == 'doi':
                result[c] = value
                result['doi-url'] = 'http://dx.doi.org/{}'.format(value)
            else:
                result[c] = dic[cls]
        return result

    # Attribute values that MyHTMLParser treats as markers of elements whose
    # text should be collected; normalize() maps each one to BibTeX fields.
    some_class = (
        'title',
        'authors',
        'dateline',
        'abstract',
        'tablecell comments',
        'tablecell arxivid',
        'tablecell subjects',
        'tablecell jref',
        'tablecell doi',
        'tablecell report-number',
        'tablecell msc-classes',
        'tablecell acm-classes',
    )

    class MyHTMLParser(HTMLParser):
        """HTML parser that fills a ``bibitem`` from elements of interest.

        Elements whose ``class`` attribute is listed in ``some_class`` are
        pushed on ``self.stack``; text seen while such an element is on top of
        the stack is accumulated in ``self.tmp`` under its class string.  When
        the element's end tag arrives the text is converted by ``normalize``
        and merged into ``self.item``.
        """

        def __init__(self):
            # Explicit base-class call (not super()): the file targets
            # Python 2's HTMLParser module.
            HTMLParser.__init__(self)
            self.item = bibitem('misc')
            self.stack = []
            self.tmp = dict()
            return

        def handle_starttag(self, tag, attrs):
            """Start tracking *tag* if its class attribute is of interest."""
            for name, value in attrs:
                # Only the class attribute counts; matching any attribute
                # value would also track e.g. name="title" on other tags.
                if name == 'class' and value in some_class:
                    self.stack.append({'tag': tag, 'class': value})
            return

        def handle_endtag(self, tag):
            """Finish the tracked element on top of the stack, if *tag* closes it."""
            if self.stack and tag == self.stack[-1]['tag']:
                s = self.stack.pop()
                # An element without any text never got an entry in self.tmp;
                # skip it instead of tripping normalize()'s assertion.
                if s['class'] in self.tmp:
                    self.item.add(normalize(s['class'], self.tmp))
            return

        def handle_data(self, data):
            """Append *data* to the text of the innermost tracked element."""
            if self.stack:
                c = self.stack[-1]['class']
                self.tmp[c] = self.tmp.get(c, '') + data
            return

    if __name__ == '__main__':
        # Command-line entry point: fetch the page at URL, extract the
        # bibliographic fields and append one BibTeX entry to the --file path.

        # Use the proxy named by $http_proxy when it is set; otherwise pass an
        # empty mapping to ProxyHandler.
        # NOTE(review): per the urllib2 docs an empty mapping disables proxy
        # auto-detection -- confirm that is intended.
        try:
            proxy = {'http': os.environ['http_proxy']}
        except KeyError:
            proxy = {}
        handler = urllib2.ProxyHandler(proxy)
        opener = urllib2.build_opener(handler)

        arg_parser = argparse.ArgumentParser()
        arg_parser.add_argument('url', type=str)
        arg_parser.add_argument('-f', '--file', type=str,
                                required=True)
        args = arg_parser.parse_args()

        parser = MyHTMLParser()
        response = opener.open(args.url)
        try:
            parser.feed(response.read())
        finally:
            # Release the connection even if parsing raises.
            response.close()
        # Flush any buffered markup *before* the item is dumped, so nothing
        # parsed late is missing from the output.
        parser.close()

        fpath = os.path.abspath(args.file)
        # Mode 'a' creates the file when it does not exist, so no separate
        # existence check is needed.
        with open(fpath, 'a') as f:
            f.write(parser.item.dump())