-
-
Save travisbrown/4bad84b4729c7a221e30 to your computer and use it in GitHub Desktop.
Revisions
-
travisbrown revised this gist
Jan 22, 2015 . 1 changed file with 37 additions and 33 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -14,49 +14,53 @@ object parse extends RegexParsers { def plain: Parser[Plain] = "[^<>\\-\\+]+".r ^^ (Plain(_)) def elements: Parser[List[Element]] = rep(tag | header | plain) def apply(s: String) = parseAll(elements, s) def groupByTag(els: List[Element]) = { val (last, rest) = els.foldLeft((List.empty[Element], List.empty[List[Element]])) { case ((Nil, done), tag @ Tag(_, _)) => (List(tag), done) case ((current, done), tag @ Tag(_, _)) => (List(tag), done :+ current) case ((current, done), other) => (current :+ other, done) } rest :+ last } } object run { def main(args: Array[String]) { val input = io.Source.fromFile( "./edF.txt" // ).getLines.take(15522).mkString(" ") // Pars Prima // ).getLines.take(59012).mkString(" ") // Pars Prima and Pars Secunda ).getLines.take(64523).mkString(" ") // entire file parse(input).map(parse.groupByTag).get.map { case Tag("1", "C") :: _ => // case start case Tag("1", "D") :: _ => // distinction start case Tag("1", "DC") :: _ => // case Tag("1", "DP") :: _ => // case Tag("2", _) :: _ => // distinction number case Tag("3", _) :: _ => // question number case Tag("4", _) :: _ => // capitulum case Tag("L", _) :: _ => // line case Tag("P", "0") :: _ => // Palea end case Tag("P", "1") :: _ => // Palea start case Tag("S", _) :: _ => // page case Tag("T", "A") :: _ => // dictum ante case Tag("T", "I") :: _ => // inscription case Tag("T", "P") :: _ => // dictum post case Tag("T", "Q") :: _ => // case statement case Tag("T", "R") :: body => val content = body.collect { case Plain(text) => text.trim }.mkString(" ") println(content.trim) case Tag("T", "T") :: _ => // text case Tag(_, _) :: _ => // error // case Tag(name, attr) => 
println(name + "\t" + attr) case _ => // do nothing } } } -
decretist revised this gist
Jan 22, 2015 . 1 changed file with 4 additions and 4 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,14 +1,14 @@ 6714256 ======= edF.txt 2192,2194: ``` <4 7> -[C. VII.]+ <T R> Ait enim Gregorius I., -[lib. VII. reg. epist. 112.]+ Siagrio Episcopo Augustodunensi: <T T> ... ``` edF.txt 10695,10697: ``` <4 29> -C. XXIX.+ <T R> Inconsultis legatis inperatoris Adrianus -[II.]+ ad pontificatum eligitur. <T T> ... ``` -
decretist revised this gist
Jan 22, 2015 . 1 changed file with 4 additions and 0 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,10 +1,14 @@ 6714256 ======= edF.txt 2192,2194: ... <4 7> -[C. VII.]+ <T R> Ait enim Gregorius I., -[lib. VII. reg. epist. 112.]+ Siagrio Episcopo Augustodunensi: <T T> ... ... edF.txt 10695,10697: ... <4 29> -C. XXIX.+ <T R> Inconsultis legatis inperatoris Adrianus -[II.]+ ad pontificatum eligitur. <T T> ... ... -
decretist revised this gist
Jan 21, 2015 . 1 changed file with 10 additions and 0 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,10 @@ 6714256 ======= edF.txt 2192,2194: <4 7> -[C. VII.]+ <T R> Ait enim Gregorius I., -[lib. VII. reg. epist. 112.]+ Siagrio Episcopo Augustodunensi: <T T> ... edF.txt 10695,10697: <4 29> -C. XXIX.+ <T R> Inconsultis legatis inperatoris Adrianus -[II.]+ ad pontificatum eligitur. <T T> ... -
decretist revised this gist
Jan 20, 2015 . 2 changed files with 64524 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1 +0,0 @@ -
decretist revised this gist
Jan 20, 2015 . 5 changed files with 10304 additions and 7 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -23,7 +23,8 @@ object run { val input = io.Source.fromFile( "./edF.txt" // ).getLines.take(15522).mkString(" ") // Pars Prima // ).getLines.take(59012).mkString(" ") // Pars Prima and Pars Secunda ).getLines.take(64523).mkString(" ") // entire file // parse(input).get foreach println parse(input).get.map { case Tag("1", "C") => // case start @@ -37,20 +38,21 @@ object run { case Tag("P", "0") => // Palea end case Tag("P", "1") => // Palea start case Tag("S", _) => // page case Tag("T", "A") => // dictum ante case Tag("T", "I") => // inscription case Tag("T", "P") => // dictum post case Tag("T", "Q") => // case statement case Tag("T", "R") => print = true // rubric case Tag("T", "T") => // text case Tag(_, _) => // error // case Tag(name, attr) => println(name + "\t" + attr) case Header(text) => // do nothing case Plain(text) => // do nothing val spaces = "\\s+".r if (print) { if (!spaces.pattern.matcher(text).matches) { // println(f"$index%04d" + " _ " + text) println(text.trim()) index = index + 1 print = false } This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,29 @@ #!/usr/bin/python # # Paul Evans ([email protected]) # from __future__ import print_function import re import sys def main(): f = open('./edF.txt', 'r') file = f.read() toc = open('./toc.txt', 'r') dictionary = {} # (?<=...) positive lookbehind assertion. 
rubrics = re.findall('(?:\<T R\>|(?<=\<T R\>))(.*?)' # rubric starts with rubric (<T R>) tag. '(?:' # non-capturing group. '\<T [IPT]\>' # rubric ends with inscription, dictum (post), or text tag. ')', file, re.S) # re.S (re.DOTALL) makes '.' special character match any character including newline. print('expected 3422 rubrics, found ' + str(len(rubrics)) + ' rubrics', file=sys.stderr) for rubric in rubrics: rubric = re.sub('\<P 1\> \-\[PALEA\.\+', '', rubric) # remove Palea tag. rubric = re.sub(re.compile('\-\[.*?\]\+', re.S), '', rubric) rubric = re.sub('\-.*?\+', '', rubric) rubric = re.sub('\s+', ' ', rubric) rubric = re.sub('^\s+', '', rubric) # remove leading whitespace characters rubric = re.sub('\s+$', '', rubric) # remove trailing whitespace characters print(rubric) if __name__ == '__main__': main() -
decretist revised this gist
Oct 2, 2013 . 1 changed file with 2 additions and 1 deletion. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -47,8 +47,9 @@ object run { // case Tag(name, attr) => println(name + "\t" + attr) case Header(text) => // do nothing case Plain(text) => // do nothing val spaces = "\\s+".r if (print) { if (!spaces.pattern.matcher(text).matches) { println(f"$index%04d" + " _ " + text) index = index + 1 print = false -
decretist revised this gist
Sep 30, 2013 . 1 changed file with 16 additions and 6 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -16,11 +16,14 @@ object parse extends RegexParsers { def apply(s: String) = parseAll(elements, s) } object run { def main(args: Array[String]) { var index = 1 var print = false val input = io.Source.fromFile( "./edF.txt" // ).getLines.take(15522).mkString(" ") // Pars Prima ).getLines.take(59011).mkString(" ") // Pars Prima and Pars Secunda // parse(input).get foreach println parse(input).get.map { case Tag("1", "C") => // case start @@ -34,16 +37,23 @@ object Process { case Tag("P", "0") => // Palea end case Tag("P", "1") => // Palea start case Tag("S", _) => // page case Tag("T", "A") => print = true // dictum ante case Tag("T", "I") => // inscription case Tag("T", "P") => print = true // dictum post case Tag("T", "Q") => print = true // case statement case Tag("T", "R") => // rubric case Tag("T", "T") => print = true // text case Tag(_, _) => // error // case Tag(name, attr) => println(name + "\t" + attr) case Header(text) => // do nothing case Plain(text) => // do nothing if (print) { if (text != " " && text != " " && text != " ") { println(f"$index%04d" + " _ " + text) index = index + 1 print = false } } } } } -
decretist revised this gist
Sep 27, 2013 . 1 changed file with 6 additions and 0 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -23,8 +23,12 @@ object Process { ).getLines.take(15522).mkString(" ") // Prima Pars // parse(input).get foreach println parse(input).get.map { case Tag("1", "C") => // case start case Tag("1", "D") => // distinction start case Tag("1", "DC") => // case Tag("1", "DP") => // case Tag("2", _) => // distinction number case Tag("3", _) => // question number case Tag("4", _) => // capitulum case Tag("L", _) => // line case Tag("P", "0") => // Palea end @@ -33,9 +37,11 @@ object Process { case Tag("T", "A") => // dictum ante case Tag("T", "I") => // inscription case Tag("T", "P") => // dictum post case Tag("T", "Q") => // case statement case Tag("T", "R") => // rubric case Tag("T", "T") => // text case Tag(_, _) => // error // case Tag(name, attr) => println(name + "\t" + attr) case Header(text) => // do nothing case Plain(text) => // do nothing } -
decretist revised this gist
Sep 27, 2013 . 2 changed files with 15 additions and 3 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1 @@ edF.txt This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -20,11 +20,22 @@ object Process { def main(args: Array[String]) { val input = io.Source.fromFile( "./edF.txt" ).getLines.take(15522).mkString(" ") // Prima Pars // parse(input).get foreach println parse(input).get.map { case Tag("1", "D") => // distinction start case Tag("2", _) => // distinction number case Tag("4", _) => // capitulum case Tag("L", _) => // line case Tag("P", "0") => // Palea end case Tag("P", "1") => // Palea start case Tag("S", _) => // page case Tag("T", "A") => // dictum ante case Tag("T", "I") => // inscription case Tag("T", "P") => // dictum post case Tag("T", "R") => // rubric case Tag("T", "T") => // text case Tag(_, _) => // error case Header(text) => // do nothing case Plain(text) => // do nothing } -
decretist renamed this gist
Sep 26, 2013 . 1 changed file with 0 additions and 0 deletions. There are no files selected for viewing
File renamed without changes. -
decretist revised this gist
Sep 26, 2013 . 1 changed file with 14 additions and 8 deletions. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,26 +1,32 @@ import scala.util.parsing.combinator._ sealed trait Element case class Tag(name: String, attr: String) extends Element case class Header(text: String) extends Element case class Plain(text: String) extends Element object parse extends RegexParsers { override def skipWhitespace = false def tag: Parser[Tag] = ("<" ~> "\\S+".r <~ "\\s".r) ~ "[^\\s>]+".r <~ ">" ^^ { case name ~ attr => Tag(name, attr) } def header: Parser[Header] = "\\-".r ~> "[^+]*".r <~ "+" ^^ (Header(_)) def plain: Parser[Plain] = "[^<>\\-\\+]+".r ^^ (Plain(_)) def elements: Parser[List[Element]] = rep(tag | header | plain) def apply(s: String) = parseAll(elements, s) } object Process { def main(args: Array[String]) { val input = io.Source.fromFile( "./edF.txt" ).getLines.take(15522).mkString(" ") // parse(input).get foreach println parse(input).get.map { case Tag(name, attr) => { println(name + "\t" + attr) } case Header(text) => // do nothing case Plain(text) => // do nothing } } } -
decretist created this gist
Sep 26, 2013 . There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,26 @@ import scala.util.parsing.combinator._ sealed trait Element case class Comment(s: String) extends Element case class Tag(name: String, attr: String) extends Element case class Text(s: String) extends Element object parse extends RegexParsers { override def skipWhitespace = false def comment: Parser[Comment] = "\\-".r ~> "[^+]*".r <~ "+" ^^ (Comment(_)) def tag: Parser[Tag] = ("<" ~> "\\S+".r <~ "\\s".r) ~ "[^\\s>]+".r <~ ">" ^^ { case name ~ attr => Tag(name, attr) } def text: Parser[Text] = "[^<>\\-\\+]+".r ^^ (Text(_)) def elements: Parser[List[Element]] = rep(comment | tag | text) def apply(s: String) = parseAll(elements, s) } object Test { def main(args: Array[String]) { val input = io.Source.fromFile( "./edF.txt" ).getLines.take(10000).mkString(" ") parse(input).get foreach println } }