#!/usr/bin/python #coding=utf8 import os import re import sys import pymysql #=============================================== # normalize input string to prevent from errors #=============================================== def str_normalize(str): try: res = '' if str is not None: strlist = str.split('\'') cnt = 0 for value in strlist: if(cnt > 0): res += '\\\'' cnt += 1 res += value except: print ("E001") sys.exit('E001') return res #=============================================== # Put many regular expression in proper order #=============================================== def get_regexlist(): try: regexlist = [] regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n(?P(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P
(.*\n)*)(?P[\s ]+事[ \s]+實.*)\n(?P(.*\n)*)(?P[ \s]*理[ \s]+由.*)\n(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n(?P(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P
(.*\n)*)(?P[ \s]*事[ \s]*實及理[ \s]*由.*)\n(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n(?P(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P
(.*\n)*)(?P[\s ]+事[ \s]+實.*)\n(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n(?P(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P
(.*\n)*)(?P[ \s]*犯罪事實[ \s]*及[ \s]*理由.*)\n(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n(?P(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P
(.*\n)*)(?P[ \s]*犯罪事實.*)\n(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n(?P(.*\n)*)[ \s]*主[ \s]+文[ \s]*\n+(?P
(.*\n)*)(?P[ \s]*理[ \s]+由.*)\n(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') regexlist.append('.*\n(?P\S+)-\S+-(?P\d+)(?P\D+)(?P\d+)\n.*\n.*\s--\s(?P\S+)\n\S裁判字號\S\n.*\n\S裁判日期\S\n(?P\d+).*\n\S裁判案由\S\n(?P(.*\n)*)\S裁判全文\S\n((?:.*分院)|(?:.*法院))(?P[^ \s]+)[ \s]+.*號\n*(?P(.*\n)*?)(?P中[ \s]+華[ \s]+民[ \s]+國.*\n(?:.*\n)*).*') except: print ("E001") sys.exit('E001') return regexlist #=============================================== # Get name of columns lied in table dynamically #=============================================== def get_all_column_name(db, cursor): try: sql = "SELECT column_name FROM INFORMATION_SCHEMA.COLUMNS WHERE table_name='refereebook'" cursor.execute(sql) mytuple = cursor.fetchall() db.commit() except: db.rollback() print ("E004") sys.exit('E004') try: columns = [] for emt in mytuple: columns.append(emt[0]) except: print ("E001") sys.exit('E001') return columns #=============================================== # Try to decompose the file #=============================================== def anylize(filepath,writeDB): # Check filepath try: os.path.isfile(filepath) except: print ("E001") sys.exit('E001') # Open database connectiion try: db = pymysql.connect(host="changethis", user="changethis", passwd="changethis", db="changethis", charset="utf8") cursor = db.cursor() except: print ("E002") sys.exit('E002') # Open the file and translate into a string try: fp = open(filepath, 'r', encoding="utf8") filecontent = "" for line in fp: filecontent += line except: print ("E001") sys.exit('E001') # Use regular expression try: columnlist = get_all_column_name(db, cursor) regexlist = get_regexlist() regex_ver = 0 for regex in regexlist: results = re.search(regex, filecontent) if(results is not None): break; regex_ver += 1 sqllist = [] for column in columnlist: if(re.search(column, regex) is not None): if(column == "Judgedate"): year = str(int(results.group(column)[0:3]) + 1911) month = results.group(column)[3:5] day = results.group(column)[5:7] sqllist.append(str_normalize(year + '-' + month + '-' + day)) else: sqllist.append(str_normalize(results.group(column))) else: sqllist.append(None) except Exception as e: print ("E003") sys.exit('E003') if(not writeDB): print (sqllist) return 0 # Construct SQL command and insert into table if(writeDB): try: sql = "INSERT INTO `refereebook`(" for column in columnlist[1:]: sql += "`{0}`".format(column) if(column == columnlist[-1]): sql += ")" break; else: sql += "," sql += "VALUES (" for emt in sqllist[1:]: if(emt is None): sql += 'NULL' else: sql += "\'{0}\'".format(emt) if(emt == sqllist[-1]): sql += ")" break; else: sql += "," cursor.execute(sql) db.commit() StatusCode = cursor.lastrowid except Exception as e: db.rollback() print (e) print ("E004") sys.exit('E004') return StatusCode #=============================================== # 'E001': Constant error # 'E002': Database connection error # 'E003': Regular expression error # 'E004': SQL fail # other: ID of successfully inserted record #=============================================== def main(argv = None): if argv is None: argv = sys.argv if(len(sys.argv) == 2): return anylize(sys.argv[1], True) if(len(sys.argv) == 3): return anylize(sys.argv[1], False) print ("E001") sys.exit('E001') if __name__ == "__main__": scode = main() print(scode) sys.exit(scode)