-
-
Save vaibhaw/56e069bda4e7d872f90f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| package com.test.lucene; | |
| import java.io.File; | |
| import java.io.IOException; | |
| import java.io.StringReader; | |
| import java.util.Calendar; | |
| import org.apache.lucene.analysis.Analyzer; | |
| import org.apache.lucene.analysis.standard.StandardAnalyzer; | |
| import org.apache.lucene.document.Document; | |
| import org.apache.lucene.index.DirectoryReader; | |
| import org.apache.lucene.index.IndexReader; | |
| import org.apache.lucene.queries.mlt.MoreLikeThis; | |
| import org.apache.lucene.queryparser.classic.ParseException; | |
| import org.apache.lucene.queryparser.classic.QueryParser; | |
| import org.apache.lucene.search.IndexSearcher; | |
| import org.apache.lucene.search.Query; | |
| import org.apache.lucene.search.ScoreDoc; | |
| import org.apache.lucene.search.TopDocs; | |
| import org.apache.lucene.store.Directory; | |
| import org.apache.lucene.store.FSDirectory; | |
| import org.apache.lucene.store.IOContext; | |
| import org.apache.lucene.store.RAMDirectory; | |
| import org.apache.lucene.util.Version; | |
| import de.jetwick.snacktory.HtmlFetcher; | |
| import de.jetwick.snacktory.JResult; | |
| /** | |
| * Testing processing time of FSDirectory and RAMDirectory | |
| * using simple and MLT queries | |
| * | |
| */ | |
| public class App | |
| { | |
| // private static String[] pages = { | |
| // "http://insidetv.ew.com/2012/10/15/walking-dead-season-3-premiere-ratings/", | |
| // "http://techcrunch.com/2012/10/16/facebook-rd-goes-global-opens-engineering-office-in-london-its-first-outside-the-u-s/", | |
| // "http://www.bostondynamics.com/index.html", | |
| // "http://www.readwriteweb.com/biz/2012/10/how-hard-is-it-to-get-and-use-a-3d-printer.php", | |
| // "http://prettymuchamazing.com/news/album-stream-bat-for-lashes-the-haunted-man"}; | |
| // | |
| private static String[] pages = { | |
| "http://www.lemonde.fr/sciences/article/2012/10/11/serge-haroche-il-y-a-une-contradiction-entre-le-temps-des-politiques-et-le-temps-de-la-recherche_1773818_1650684.html", | |
| "http://www.business-angel-france.com/business-angel-france-coup-de-gueule-faut-pas-prendre-les-business-angels-pour-des-mougeons", | |
| "http://www.larecherche.fr/content/actualite-sante/article?id=32457", | |
| "http://www.lesechos.fr/entreprises-secteurs/tech-medias/actu/0202329311759-open-data-l-etat-cherche-a-monetiser-les-donnees-publiques-500953.php", | |
| "http://www.usinenouvelle.com/article/les-pme-poumons-essentiels-de-l-economie-europeenne.N183637"}; | |
| private static String[] terms = { | |
| "Zombie", | |
| "Vampire", | |
| "Nigth", | |
| "Wolf", | |
| "Human"}; | |
| private static int factor = 1; | |
| private static String[] content = new String[pages.length]; | |
| public static void main( String[] args ) | |
| { | |
| System.out.println("Getting " + pages.length + " pages' content... "); | |
| for(int i =0; i<pages.length ; i++){ | |
| JResult res; | |
| try { | |
| res = new HtmlFetcher().fetchAndExtract(pages[i], 10000, true); | |
| content[i] = res.getTitle() + " " + res.getText(); | |
| } catch (Exception e) { | |
| // TODO Auto-generated catch block | |
| e.printStackTrace(); | |
| } | |
| } | |
| // creates the index | |
| try { | |
| String fs_path = "/Users/alejandra/Developments/apache-solr-4.0.0/example/solr/data/topics_fr/index"; | |
| System.out.println("\nCreate FSIndex ..."); | |
| FSDirectory fsi = FSDirectory.open(new File(fs_path)); | |
| IndexReader reader = DirectoryReader.open(fsi); | |
| System.out.println("SearchMLT queries with FSIndex"); | |
| long time = 0; | |
| for(String c: content) | |
| time+= searchMLT(reader, c); | |
| System.out.println("...processed " + (pages.length * factor) +" in " + time); | |
| System.out.println("Simple search with FSIndex"); | |
| time = 0; | |
| for(String t: terms) | |
| time+= searchIndex(reader, t); | |
| System.out.println("...processed " + (terms.length * factor) +" in " + time); | |
| reader.close(); | |
| /********* USE RAM DIR ***********/ | |
| System.out.println("\nCreating RAMDirectory... "); | |
| time = Calendar.getInstance().getTimeInMillis(); | |
| RAMDirectory rami = new RAMDirectory(fsi, IOContext.DEFAULT); | |
| System.out.println("...created index in " + (Calendar.getInstance().getTimeInMillis() -time)); | |
| reader = DirectoryReader.open(rami); | |
| System.out.println("SearchMLT queries with RAMDirectory "); | |
| time = 0; | |
| for(String c: content) | |
| time+= searchMLT(reader, c); | |
| System.out.println("...processed " + (pages.length * factor) +" in " + time); | |
| System.out.println("Simple search with RAMDirectory"); | |
| time = 0; | |
| for(String t: terms) | |
| time+= searchIndex(reader, t); | |
| System.out.println("...processed " + (terms.length * factor) +" in " + time); | |
| reader.close(); | |
| fsi.close(); | |
| rami.close(); | |
| } catch (IOException e) { | |
| // TODO Auto-generated catch block | |
| e.printStackTrace(); | |
| } catch (ParseException e) { | |
| // TODO Auto-generated catch block | |
| e.printStackTrace(); | |
| } | |
| } | |
| public static long searchMLT(IndexReader reader, String text) throws IOException{ | |
| long start = Calendar.getInstance().getTimeInMillis(); | |
| IndexSearcher searcher = new IndexSearcher(reader); | |
| MoreLikeThis mlt = new MoreLikeThis(reader); // Pass the index reader | |
| mlt.setAnalyzer(new StandardAnalyzer(Version.LUCENE_40)); | |
| mlt.setFieldNames(new String[] {"text"}); | |
| mlt.setMinWordLen(3); | |
| mlt.setMinTermFreq(3); | |
| mlt.setMinDocFreq(2); | |
| Query query = mlt.like( new StringReader(text), "text"); | |
| // System.out.println(text.substring(0, text.indexOf(" ", 100)) ); | |
| TopDocs topDocs = searcher.search(query, 5); | |
| // System.out.println(" hits:" + topDocs.totalHits); | |
| for ( ScoreDoc scoreDoc : topDocs.scoreDocs ) { | |
| Document doc = searcher.doc( scoreDoc.doc ); | |
| // System.out.print(" " + doc.getField("id").stringValue()); | |
| } | |
| // System.out.println(); | |
| return (Calendar.getInstance().getTimeInMillis()-start); | |
| } | |
| public static long searchIndex(IndexReader reader, String searchString) throws IOException, ParseException { | |
| long start = Calendar.getInstance().getTimeInMillis(); | |
| IndexSearcher indexSearcher = new IndexSearcher(reader); | |
| QueryParser parser = new QueryParser(Version.LUCENE_40, "text", new StandardAnalyzer(Version.LUCENE_40)); | |
| Query query = parser.parse(searchString); | |
| ScoreDoc[] hits = indexSearcher.search(query, null, 10).scoreDocs; | |
| // System.out.println(" hits:" + hits.length); | |
| // Iterate through the results: | |
| for (int i = 0; i < hits.length; i++) { | |
| Document hitDoc = indexSearcher.doc(hits[i].doc); | |
| } | |
| return (Calendar.getInstance().getTimeInMillis()-start); | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment