Skip to content

Instantly share code, notes, and snippets.

@anothernoise
Forked from travisbrown/build.sbt
Created October 13, 2015 08:58
Show Gist options
  • Select an option

  • Save anothernoise/fc50a8ee64d3eed56919 to your computer and use it in GitHub Desktop.

Select an option

Save anothernoise/fc50a8ee64d3eed56919 to your computer and use it in GitHub Desktop.

Revisions

  1. @travisbrown travisbrown revised this gist Jul 30, 2013. 1 changed file with 5 additions and 5 deletions.
    10 changes: 5 additions & 5 deletions http.scala
    Original file line number Diff line number Diff line change
    @@ -15,12 +15,12 @@ object Searcher {
    yearRange: (Int, Int),
    page: Option[Int] = None
    ) = pageReq <<? Map(
    "format" -> "json",
    "date1" -> yearRange._1.toString,
    "date2" -> yearRange._2.toString,
    "format" -> "json",
    "date1" -> yearRange._1.toString,
    "date2" -> yearRange._2.toString,
    "dateFilterType" -> "yearRange",
    "andtext" -> conj.mkString(" "),
    "ortext" -> disj.mkString(" ")
    "andtext" -> conj.mkString(" "),
    "ortext" -> disj.mkString(" ")
    ) ++ page.map("page" -> _.toString)

    def retrievePage(
  2. @travisbrown travisbrown revised this gist Jul 30, 2013. 1 changed file with 3 additions and 5 deletions.
    8 changes: 3 additions & 5 deletions http.scala
    Original file line number Diff line number Diff line change
    @@ -18,11 +18,9 @@ object Searcher {
    "format" -> "json",
    "date1" -> yearRange._1.toString,
    "date2" -> yearRange._2.toString,
    "dateFilterType" -> "yearRange"
    ) ++ conj.headOption.map(
    const("andtext" -> conj.mkString(" "))
    ) ++ disj.headOption.map(
    const("ortext" -> disj.mkString(" "))
    "dateFilterType" -> "yearRange",
    "andtext" -> conj.mkString(" "),
    "ortext" -> disj.mkString(" ")
    ) ++ page.map("page" -> _.toString)

    def retrievePage(
  3. @travisbrown travisbrown created this gist Jul 29, 2013.
    7 changes: 7 additions & 0 deletions build.sbt
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,7 @@
    // Scala version used to compile this project.
    scalaVersion := "2.10.2"

    // Third-party libraries: Dispatch core and its json4s-jackson module
    // (both resolved with the Scala-version suffix via `%%`), plus opencsv
    // (a plain Java artifact, hence the single `%`).
    libraryDependencies ++= Seq(
    "net.databinder.dispatch" %% "dispatch-core" % "0.11.0",
    "net.databinder.dispatch" %% "dispatch-json4s-jackson" % "0.11.0",
    "net.sf.opencsv" % "opencsv" % "2.0"
    )
    60 changes: 60 additions & 0 deletions http.scala
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,60 @@
    // Builds and runs page-search requests against `pageReq`, and saves the
    // collected results as CSV.
    object Searcher {
      import dispatch.{ Http, url }, dispatch.Defaults._, dispatch.as
      import org.json4s._
      import scala.Function.const
      import scala.concurrent.Future
      import scala.util._

      // Formats used by the json4s `extract` calls below.
      implicit val formats = DefaultFormats

      // Base request; query parameters are appended per call.
      val pageReq = url("http://chroniclingamerica.loc.gov/search/pages/results/")

      // Optional query parameter holding the space-joined terms.
      // Yields None when the term list is empty, so the parameter is omitted.
      private def termsParam(name: String, terms: List[String]) =
        terms.headOption.map(const(name -> terms.mkString(" ")))

      // Builds the search request.
      //   conj      - terms sent as "andtext" (omitted when empty)
      //   disj      - terms sent as "ortext" (omitted when empty)
      //   yearRange - sent as "date1" / "date2" with dateFilterType "yearRange"
      //   page      - optional page number, sent as "page" when present
      def constructRequest(
        conj: List[String],
        disj: List[String],
        yearRange: (Int, Int),
        page: Option[Int] = None
      ) = {
        val (firstYear, lastYear) = yearRange

        val fixedParams = Map(
          "format" -> "json",
          "date1" -> firstYear.toString,
          "date2" -> lastYear.toString,
          "dateFilterType" -> "yearRange"
        )

        val allParams =
          fixedParams ++
            termsParam("andtext", conj) ++
            termsParam("ortext", disj) ++
            page.map("page" -> _.toString)

        pageReq <<? allParams
      }

      // Issues the request for one result page and parses the response as JSON.
      // The parameters are passed straight through to `constructRequest`.
      def retrievePage(
        conj: List[String],
        disj: List[String],
        yearRange: (Int, Int),
        page: Option[Int] = None
      ) = {
        val request = constructRequest(conj, disj, yearRange, page)
        Http(request OK as.json4s.Json)
      }

      // Fetches the first result page, reads its page count, then requests
      // pages 2 through pageCount and concatenates all items, first page first.
      def search(
        conj: List[String],
        disj: List[String],
        yearRange: (Int, Int)
      ) = retrievePage(conj, disj, yearRange).flatMap { firstJson =>
        val firstPage = firstJson.extract[ResultSet]

        val laterPages = Future.traverse(2 to firstPage.pageCount) { pageNumber =>
          retrievePage(conj, disj, yearRange, Some(pageNumber)).map { pageJson =>
            pageJson.extract[ResultSet].items
          }
        }

        laterPages.map(later => firstPage.items ++ later.flatten)
      }

      // Asynchronously begin downloads and print result when completed.
      // On success the items are also written to `path` via CsvOutput.
      def saveSearchResults(
        conj: List[String],
        disj: List[String],
        yearRange: (Int, Int)
      )(path: String) = {
        val pendingItems = search(conj, disj, yearRange)

        pendingItems.onComplete {
          case Failure(e) =>
            println("There was a problem: %s".format(e))
          case Success(items) =>
            println("Successfully downloaded %d items!".format(items.size))
            CsvOutput.writeItems(items)(path)
        }
      }
    }
    34 changes: 34 additions & 0 deletions model.scala
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,34 @@
    // One newspaper page from the search results.
    case class Item(
      id: String,
      url: String,
      lccn: String,
      date: String,
      sequence: Int,
      title: String,
      ocr_eng: String
    ) {
      // The path segment that follows "/lccn/" in `id`.
      // Throws a MatchError when `id` does not match Item.PubIdPattern.
      def pubId = {
        val Item.PubIdPattern(publication) = id
        publication
      }

      // `date` (eight digits) rendered with hyphens between its 4-, 2- and
      // 2-digit groups. Throws a MatchError when `date` does not match
      // Item.DatePattern.
      def formattedDate = {
        val Item.DatePattern(year, month, day) = date
        "%s-%s-%s".format(year, month, day)
      }
    }

    // Patterns used by Item's derived accessors.
    object Item {
      val PubIdPattern = """/lccn/([^/]+)/.*""".r
      val DatePattern = """(\d\d\d\d)(\d\d)(\d\d)""".r
    }

    // Represents a set of search query results: the reported total number of
    // items, the page size, and the items carried by this result set.
    case class ResultSet(totalItems: Int, itemsPerPage: Int, items: List[Item]) {
      // Number of pages needed to hold `totalItems`: the integer quotient, plus
      // one more page when there is a remainder.
      //
      // A non-positive `itemsPerPage` yields 0 rather than dividing by zero
      // (which would throw ArithmeticException), so callers that iterate over
      // `2 to pageCount` simply see no further pages.
      def pageCount = {
        if (itemsPerPage <= 0) 0
        else {
          val fullPages = totalItems / itemsPerPage
          val leftover = totalItems % itemsPerPage

          fullPages + math.signum(leftover)
        }
      }
    }
    23 changes: 23 additions & 0 deletions output.scala
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,23 @@
    // The boring stuff: writing the CSV file.
    object CsvOutput {
      import au.com.bytecode.opencsv.CSVWriter

      // Writes one CSV row per item to the file at `path`, in list order.
      // Columns: publication id, title, formatted date, sequence number, URL,
      // and OCR text with each newline replaced by a space.
      //
      // The writer is closed in a `finally` block so the file handle is
      // released even when a row fails part-way through (for example when an
      // item's `pubId` or `formattedDate` accessor throws on malformed data).
      def writeItems(items: List[Item])(path: String) = {
        val writer = new CSVWriter(new java.io.FileWriter(path))

        try {
          items.foreach { item =>
            writer.writeNext(
              Array(
                item.pubId,
                item.title,
                item.formattedDate,
                item.sequence.toString,
                item.url,
                item.ocr_eng.replaceAll("\n", " ")
              )
            )
          }
        } finally {
          writer.close()
        }
      }
    }