-
-
Save travisbrown/4bad84b4729c7a221e30 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| edF.txt |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import scala.util.parsing.combinator._ | |
/** A single unit produced by the `parse` grammar: a tag, a header, or a run of plain text. */
sealed trait Element

/**
 * A markup tag written as `<name attr>` in the input.
 *
 * @param name the non-whitespace characters that follow `<`
 * @param attr the characters between the separating whitespace and the closing `>`
 */
final case class Tag(name: String, attr: String) extends Element

/**
 * A header section, written in the input as `-` followed by text and terminated by `+`.
 *
 * @param text the characters between the leading `-` and the closing `+` (may be empty)
 */
final case class Header(text: String) extends Element

/**
 * A run of ordinary text that contains none of the delimiter characters `<`, `>`, `-`, `+`.
 *
 * @param text the matched characters, whitespace included
 */
final case class Plain(text: String) extends Element
/**
 * Grammar that splits an input string into a flat list of [[Element]]s.
 *
 * Three kinds of element are recognised: `<name attr>` tags, `-...+` headers,
 * and plain text made of anything that is not one of `<`, `>`, `-`, `+`.
 */
object parse extends RegexParsers {
  // Whitespace carries meaning here: it separates a tag's name from its
  // attribute and is kept verbatim inside plain text, so it must not be skipped.
  override def skipWhitespace = false

  /** Parses `<name attr>`: a name, exactly one whitespace character, then the attribute. */
  def tag: Parser[Tag] = {
    val tagName = "<" ~> "\\S+".r <~ "\\s".r
    val tagAttr = "[^\\s>]+".r <~ ">"
    (tagName ~ tagAttr) ^^ { case n ~ a => Tag(n, a) }
  }

  /** Parses `-text+`, keeping only the (possibly empty) text between the two markers. */
  def header: Parser[Header] =
    ("\\-".r ~> "[^+]*".r <~ "+") ^^ (body => Header(body))

  /** Parses one or more characters that are not `<`, `>`, `-` or `+`. */
  def plain: Parser[Plain] =
    "[^<>\\-\\+]+".r ^^ (chars => Plain(chars))

  /** Parses zero or more elements, trying tag, then header, then plain text at each position. */
  def elements: Parser[List[Element]] = {
    val element: Parser[Element] = tag | header | plain
    rep(element)
  }

  /**
   * Runs the grammar over the whole of `s`.
   *
   * @param s the text to parse
   * @return the result of `parseAll`, which succeeds only when all of `s` is consumed
   */
  def apply(s: String): ParseResult[List[Element]] = parseAll(elements, s)
}
/** Command-line entry point: reads the source text, parses it, and walks the resulting elements. */
object Process {

  /** Input file, resolved relative to the working directory. */
  private val InputPath = "./edF.txt"

  /** Number of leading lines to read; the original code labels this range "Prima Pars". */
  private val PrimaParsLines = 15522

  /**
   * Reads the first `PrimaParsLines` lines of `InputPath`, joins them with single
   * spaces, parses the result and dispatches on every element.
   *
   * @param args command-line arguments (unused)
   * @throws RuntimeException if the input cannot be parsed; the message includes the
   *                          parser's own failure description
   */
  def main(args: Array[String]): Unit = {
    // NOTE(review): the file is decoded with the default codec, as before — confirm
    // that this matches the encoding of the data file.
    val source = io.Source.fromFile(InputPath)
    val input =
      try source.getLines().take(PrimaParsLines).mkString(" ") // Prima Pars
      finally source.close() // release the file handle even if reading fails

    // parse(input).get foreach println
    parse(input) match {
      case parse.Success(elements, _) =>
        // foreach, not map: the handlers only have effects, so no result list is built
        elements.foreach(handle)
      case failure: parse.NoSuccess =>
        // Still a RuntimeException, as with the former `.get`, but now says where and why.
        sys.error(s"Failed to parse $InputPath: $failure")
    }
  }

  /** Dispatches on a single parsed element; every branch is currently a placeholder. */
  private def handle(element: Element): Unit = element match {
    case Tag("1", "D") => // distinction start
    case Tag("2", _) => // distinction number
    case Tag("4", _) => // capitulum
    case Tag("L", _) => // line
    case Tag("P", "0") => // Palea end
    case Tag("P", "1") => // Palea start
    case Tag("S", _) => // page
    case Tag("T", "A") => // dictum ante
    case Tag("T", "I") => // inscription
    case Tag("T", "P") => // dictum post
    case Tag("T", "R") => // rubric
    case Tag("T", "T") => // text
    case Tag(_, _) => // error: unrecognised tag (currently ignored)
    case Header(_) => // do nothing
    case Plain(_) => // do nothing
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment