uchida · March 22, 2024 11:19 · Aug 10, 2016 · Jan 31, 2016 · Jan 31, 2016
diff --git a/arxiv2bib.py b/arxiv2bib.py
@@ -1,9 +1,12 @@
-#!/usr/bin/env python
+#!/usr/bin/python
 # -*- coding: utf-8 -*-
 # CC0, dedicated to public domain by Akihiro Uchida
 import argparse
 import urllib2, os
 from HTMLParser import HTMLParser
+import re
+import calendar
+ARXIV_ID_RE = re.compile(r'arXiv:((\d\d)(\d\d)\.\d+)')
 
 class bibitem(object):
     def __init__(self, bibtype):
@@ -93,36 +96,35 @@ def normalize(cls, dic):
     assert cls in dic
     value = dic[cls]
     result = dict()
-    if cls == 'title':
-        result['title'] = value.partition('Title:')[-1].strip('\n')
+    if cls == 'title mathjax':
+        result['title'] = value.strip('\n')
     elif cls == 'authors':
         result['author'] = ''
-        for c in value.partition('Authors:')[-1].strip('\n'):
+        for c in value.strip('\n'):
             if c == ',':
                 result['author'] += ' and '
             else:
                 result['author'] += c
-    elif cls == 'dateline':
-        date = value.strip('()').split()
-        (result['year'], result['month']) = (date[4], date[3])
-    elif cls == 'abstract':
+    elif cls == 'abstract mathjax':
         parser = AbstParser()
-        parser.feed(value.partition('Abstract:')[-1].strip())
-    # todo
-        result[cls] = parser.text
+        parser.feed(value.strip())
+        result['abstract'] = parser.text
     if cls.startswith('tablecell '):
         c = cls.partition('tablecell ')[-1]
         if c == 'arxivid':
-            result['eprint'] = value[:15]
-            result['url'] = 'http://arxiv.org/abs/{}'.format(value[6:15])
+            result['eprint'] = value
+            m = ARXIV_ID_RE.match(value)
+            result['url'] = 'http://arxiv.org/abs/{}'.format(m.group(1))
+            result['year'] = '20{}'.format(m.group(2))
+            result['month'] = calendar.month_abbr[int(m.group(3))]
         elif c == 'doi':
             result[c] = value
             result['doi-url'] = 'http://dx.doi.org/{}'.format(value)
         else:
             result[c] = dic[cls]
     return result
 
-some_classes = ('title', 'authors', 'dateline', 'abstract',
+some_classes = ('title mathjax', 'authors', 'abstract mathjax',
                 'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
                 'tablecell jref', 'tablecell doi', 'tablecell report-number',
                 'tablecell msc-classes', 'tablecell acm-classes')
@@ -132,23 +134,30 @@ def __init__(self):
         HTMLParser.__init__(self)
         self.item = bibitem('misc')
         self.stack = []
+        self.in_descriptor = False
         self.tmp = dict()
         return
 
     def handle_starttag(self, tag, attrs):
         for attr in attrs:
             if attr[1] in some_classes:
-                self.stack.append({'tag': tag, 'class':attr[1]})
+                self.stack.append({'tag': tag, 'class': attr[1]})
+            if attr[1] == "descriptor":
+                self.in_descriptor = True
         return
 
     def handle_endtag(self, tag):
+        if self.in_descriptor and tag == "span":
+            self.in_descriptor = False
         if self.stack != [] and tag == self.stack[-1]['tag']:
             s = self.stack.pop()
             self.item.add(normalize(s['class'], self.tmp))
         return
 
     def handle_data(self, data):
         for c in some_classes:
+            if self.in_descriptor:
+                continue
             if self.stack != [] and self.stack[-1]['class'] == c:
                 self.tmp[c] = self.tmp.get(c, '') + data
         return
@@ -173,4 +182,4 @@ def handle_data(self, data):
     fflag = 'a' if os.path.exists(fpath) else 'w'
     with open(fpath, fflag) as f:
         f.write(parser.item.dump())
-    parser.close()
+    parser.close()
diff --git a/arxiv2bib.py b/arxiv2bib.py
@@ -122,10 +122,10 @@ def normalize(cls, dic):
             result[c] = dic[cls]
     return result
 
-some_class = ('title', 'authors', 'dateline', 'abstract',
-              'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
-              'tablecell jref', 'tablecell doi', 'tablecell report-number',
-              'tablecell msc-classes', 'tablecell acm-classes')
+some_classes = ('title', 'authors', 'dateline', 'abstract',
+                'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
+                'tablecell jref', 'tablecell doi', 'tablecell report-number',
+                'tablecell msc-classes', 'tablecell acm-classes')
 
 class MyHTMLParser(HTMLParser):
     def __init__(self):
@@ -137,7 +137,7 @@ def __init__(self):
 
     def handle_starttag(self, tag, attrs):
         for attr in attrs:
-            if attr[1] in some_class:
+            if attr[1] in some_classes:
                 self.stack.append({'tag': tag, 'class':attr[1]})
         return
 
@@ -148,7 +148,7 @@ def handle_endtag(self, tag):
         return
 
     def handle_data(self, data):
-        for c in some_class:
+        for c in some_classes:
             if self.stack != [] and self.stack[-1]['class'] == c:
                 self.tmp[c] = self.tmp.get(c, '') + data
         return

diff --git a/arxiv2bib.py b/arxiv2bib.py
@@ -0,0 +1,176 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# CC0, dedicated to public domain by Akihiro Uchida
+import argparse
+import urllib2, os
+from HTMLParser import HTMLParser
+
+class bibitem(object):
+    def __init__(self, bibtype):
+        assert isinstance(bibtype, str)
+        self.bibtype = bibtype
+        self.field = dict()
+        return
+
+    def add(self, dic):
+        assert isinstance(dic, dict)
+        for k, v in dic.iteritems():
+            self.field[k] = self.field.get(k, '') + v
+        return
+
+    def gen_key(self):
+        key = ''
+        if 'year' in self.field:
+            key += self.field['year']
+        if 'author' in self.field:
+            authors = self.field['author'].split('and')
+            for author in authors:
+                cnt = 0
+                for w in author.split():
+                    if cnt < len(w):
+                        (cnt, name) = (len(w), w.strip(',.'))
+                        key += name
+        if 'title' in self.field:
+            for w in self.field['title'].split():
+                key += w.title()
+                if len(w) > 4:
+                    break
+        return key
+
+    def dump(self):
+        d = '@{}{{{}'.format(self.bibtype, self.gen_key())
+        for k, v in self.field.iteritems():
+            if v not in ['', None]:
+                d += ',\n{}={{{}}}'.format(k, v)
+        d += '}\n'
+        return d
+
+class AbstParser(object):
+    def __init__(self):
+        self.parse = self.parse_main
+        self.text = ''
+        return
+
+    def feed(self, text):
+        i = 0
+        while i < len(text):
+            (self.parse, i) = self.parse(text, i)
+        return
+
+    def parse_main(self, text, i):
+        c = text[i]
+        if c == '"':
+            self.text += '``'
+            return (self.parse_quote, i+1)
+        if c == '-':
+            return (self.parse_hyphen, i+1)
+        else:
+            if c == '\n':
+                self.text += ' '
+            else:
+                self.text += c
+            return (self.parse_main, i+1)
+
+    def parse_quote(self, text, i):
+        c = text[i]
+        if c == '"':
+            self.text += '\'\''
+            return (self.parse_main, i+1)
+        else:
+            if c == '\n':
+                self.text += ' '
+            else:
+                self.text += c
+            return (self.parse_quote, i+1)
+
+    def parse_hyphen(self, text, i):
+        c = text[i]
+        if c not in (' ', '\n'):
+            self.text += '-'
+        return (self.parse_main, i+1)
+
+def normalize(cls, dic):
+    assert cls in dic
+    value = dic[cls]
+    result = dict()
+    if cls == 'title':
+        result['title'] = value.partition('Title:')[-1].strip('\n')
+    elif cls == 'authors':
+        result['author'] = ''
+        for c in value.partition('Authors:')[-1].strip('\n'):
+            if c == ',':
+                result['author'] += ' and '
+            else:
+                result['author'] += c
+    elif cls == 'dateline':
+        date = value.strip('()').split()
+        (result['year'], result['month']) = (date[4], date[3])
+    elif cls == 'abstract':
+        parser = AbstParser()
+        parser.feed(value.partition('Abstract:')[-1].strip())
+    # todo
+        result[cls] = parser.text
+    if cls.startswith('tablecell '):
+        c = cls.partition('tablecell ')[-1]
+        if c == 'arxivid':
+            result['eprint'] = value[:15]
+            result['url'] = 'http://arxiv.org/abs/{}'.format(value[6:15])
+        elif c == 'doi':
+            result[c] = value
+            result['doi-url'] = 'http://dx.doi.org/{}'.format(value)
+        else:
+            result[c] = dic[cls]
+    return result
+
+some_class = ('title', 'authors', 'dateline', 'abstract',
+              'tablecell comments', 'tablecell arxivid', 'tablecell subjects',
+              'tablecell jref', 'tablecell doi', 'tablecell report-number',
+              'tablecell msc-classes', 'tablecell acm-classes')
+
+class MyHTMLParser(HTMLParser):
+    def __init__(self):
+        HTMLParser.__init__(self)
+        self.item = bibitem('misc')
+        self.stack = []
+        self.tmp = dict()
+        return
+
+    def handle_starttag(self, tag, attrs):
+        for attr in attrs:
+            if attr[1] in some_class:
+                self.stack.append({'tag': tag, 'class':attr[1]})
+        return
+
+    def handle_endtag(self, tag):
+        if self.stack != [] and tag == self.stack[-1]['tag']:
+            s = self.stack.pop()
+            self.item.add(normalize(s['class'], self.tmp))
+        return
+
+    def handle_data(self, data):
+        for c in some_class:
+            if self.stack != [] and self.stack[-1]['class'] == c:
+                self.tmp[c] = self.tmp.get(c, '') + data
+        return
+
+if __name__ == '__main__':
+    try:
+        proxy = {'http': os.environ['http_proxy']}
+    except KeyError, e:
+        proxy = {}
+    handler = urllib2.ProxyHandler(proxy)
+    opener = urllib2.build_opener(handler)
+    arg_parser = argparse.ArgumentParser()
+    arg_parser.add_argument('url', type=str)
+    arg_parser.add_argument('-f', '--file', type=str,
+                            required=True)
+    args = arg_parser.parse_args()
+    parser = MyHTMLParser()
+    response = opener.open(args.url)
+    parser.feed(response.read())
+    response.close()
+    fpath = os.path.abspath(args.file)
+    fflag = 'a' if os.path.exists(fpath) else 'w'
+    with open(fpath, fflag) as f:
+        f.write(parser.item.dump())
+    parser.close()
No results found