Skip to content

Instantly share code, notes, and snippets.

@vaibhaw
Forked from alegrm/App
Last active August 29, 2015 14:19
Show Gist options
  • Select an option

  • Save vaibhaw/56e069bda4e7d872f90f to your computer and use it in GitHub Desktop.

Select an option

Save vaibhaw/56e069bda4e7d872f90f to your computer and use it in GitHub Desktop.
package com.test.lucene;
import java.io.File;
import java.io.IOException;
import java.io.StringReader;
import java.util.Calendar;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.queries.mlt.MoreLikeThis;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.Version;
import de.jetwick.snacktory.HtmlFetcher;
import de.jetwick.snacktory.JResult;
/**
 * Compares query processing time of an on-disk index ({@link FSDirectory})
 * against the same index copied into memory ({@link RAMDirectory}),
 * using MoreLikeThis (MLT) queries and simple parsed term queries.
 *
 * <p>Usage: {@code App [indexPath]}. When no argument is given, the
 * built-in default index path is used.
 */
public class App
{
    // Alternative page set (disabled):
    // private static String[] pages = {
    // "http://insidetv.ew.com/2012/10/15/walking-dead-season-3-premiere-ratings/",
    // "http://techcrunch.com/2012/10/16/facebook-rd-goes-global-opens-engineering-office-in-london-its-first-outside-the-u-s/",
    // "http://www.bostondynamics.com/index.html",
    // "http://www.readwriteweb.com/biz/2012/10/how-hard-is-it-to-get-and-use-a-3d-printer.php",
    // "http://prettymuchamazing.com/news/album-stream-bat-for-lashes-the-haunted-man"};
    //

    /** Pages whose extracted title and text are used as input for the MLT queries. */
    private static final String[] pages = {
        "http://www.lemonde.fr/sciences/article/2012/10/11/serge-haroche-il-y-a-une-contradiction-entre-le-temps-des-politiques-et-le-temps-de-la-recherche_1773818_1650684.html",
        "http://www.business-angel-france.com/business-angel-france-coup-de-gueule-faut-pas-prendre-les-business-angels-pour-des-mougeons",
        "http://www.larecherche.fr/content/actualite-sante/article?id=32457",
        "http://www.lesechos.fr/entreprises-secteurs/tech-medias/actu/0202329311759-open-data-l-etat-cherche-a-monetiser-les-donnees-publiques-500953.php",
        "http://www.usinenouvelle.com/article/les-pme-poumons-essentiels-de-l-economie-europeenne.N183637"};

    /**
     * Query strings used for the simple searches.
     * NOTE(review): "Nigth" looks like a typo for "Night"; it is kept as-is
     * because changing it would change the queries being measured.
     */
    private static final String[] terms = {
        "Zombie",
        "Vampire",
        "Nigth",
        "Wolf",
        "Human"};

    /** Multiplier applied to the reported query counts only; each query is executed once. */
    private static final int factor = 1;

    /** Index location used when no path is passed on the command line. */
    private static final String DEFAULT_INDEX_PATH =
        "/Users/alejandra/Developments/apache-solr-4.0.0/example/solr/data/topics_fr/index";

    /** Extracted content per page; an entry stays {@code null} when fetching that page failed. */
    private static final String[] content = new String[pages.length];

    /**
     * Fetches the pages, then times MLT and simple queries first against the
     * file-system index and then against an in-memory copy of it.
     *
     * @param args optional; {@code args[0]} overrides the default index path
     */
    public static void main( String[] args )
    {
        String indexPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;

        fetchPages();

        try {
            System.out.println("\nCreate FSIndex ...");
            FSDirectory fsi = FSDirectory.open(new File(indexPath));
            try {
                benchmark(fsi, "SearchMLT queries with FSIndex", "Simple search with FSIndex");

                /********* USE RAM DIR ***********/
                System.out.println("\nCreating RAMDirectory... ");
                long start = System.nanoTime();
                RAMDirectory rami = new RAMDirectory(fsi, IOContext.DEFAULT);
                try {
                    System.out.println("...created index in " + elapsedMillis(start));
                    benchmark(rami, "SearchMLT queries with RAMDirectory ", "Simple search with RAMDirectory");
                } finally {
                    rami.close();
                }
            } finally {
                // Released even when a benchmark step throws.
                fsi.close();
            }
        } catch (IOException e) {
            System.err.println("Index access failed for " + indexPath + ": " + e);
            e.printStackTrace();
        } catch (ParseException e) {
            System.err.println("Could not parse a search term: " + e);
            e.printStackTrace();
        }
    }

    /**
     * Downloads every entry of {@code pages} and stores "title text" in the
     * matching slot of {@code content}. Best-effort: a page that cannot be
     * fetched is reported and its slot is left {@code null}.
     */
    private static void fetchPages() {
        System.out.println("Getting " + pages.length + " pages' content... ");
        for (int i = 0; i < pages.length; i++) {
            try {
                JResult res = new HtmlFetcher().fetchAndExtract(pages[i], 10000, true);
                content[i] = res.getTitle() + " " + res.getText();
            } catch (Exception e) {
                System.err.println("Failed to fetch " + pages[i] + ": " + e);
            }
        }
    }

    /**
     * Opens a reader on {@code directory}, runs all MLT queries and all simple
     * queries against it, prints the accumulated times and closes the reader.
     *
     * @param directory    index to query
     * @param mltBanner    line printed before the MLT queries
     * @param simpleBanner line printed before the simple queries
     * @throws IOException    if the index cannot be read
     * @throws ParseException if a search term cannot be parsed
     */
    private static void benchmark(Directory directory, String mltBanner, String simpleBanner)
            throws IOException, ParseException {
        IndexReader reader = DirectoryReader.open(directory);
        try {
            System.out.println(mltBanner);
            long time = 0;
            int processed = 0;
            for (String c : content) {
                if (c == null) {
                    continue; // page could not be fetched; nothing to query with
                }
                time += searchMLT(reader, c);
                processed++;
            }
            System.out.println("...processed " + (processed * factor) + " in " + time);

            System.out.println(simpleBanner);
            time = 0;
            for (String t : terms) {
                time += searchIndex(reader, t);
            }
            System.out.println("...processed " + (terms.length * factor) + " in " + time);
        } finally {
            reader.close();
        }
    }

    /** Returns the milliseconds elapsed since {@code startNanos}, a value taken from {@link System#nanoTime()}. */
    private static long elapsedMillis(long startNanos) {
        return (System.nanoTime() - startNanos) / 1000000L;
    }

    /**
     * Builds a MoreLikeThis query from {@code text} on the "text" field, runs it
     * for the top 5 hits and loads each hit's stored document.
     *
     * @param reader open reader on the index to search
     * @param text   source text for the MLT query; must not be {@code null}
     * @return elapsed time in milliseconds, including query construction,
     *         the search and loading of the hit documents
     * @throws IOException if reading the index fails
     */
    public static long searchMLT(IndexReader reader, String text) throws IOException{
        long start = System.nanoTime();
        IndexSearcher searcher = new IndexSearcher(reader);
        MoreLikeThis mlt = new MoreLikeThis(reader); // Pass the index reader
        mlt.setAnalyzer(new StandardAnalyzer(Version.LUCENE_40));
        mlt.setFieldNames(new String[] {"text"});
        mlt.setMinWordLen(3);
        mlt.setMinTermFreq(3);
        mlt.setMinDocFreq(2);
        Query query = mlt.like(new StringReader(text), "text");
        TopDocs topDocs = searcher.search(query, 5);
        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
            // Result is discarded on purpose: document retrieval is part of the timed work.
            searcher.doc(scoreDoc.doc);
        }
        return elapsedMillis(start);
    }

    /**
     * Parses {@code searchString} against the "text" field, runs the query for
     * the top 10 hits and loads each hit's stored document.
     *
     * @param reader       open reader on the index to search
     * @param searchString query in classic query-parser syntax
     * @return elapsed time in milliseconds, including parsing, the search and
     *         loading of the hit documents
     * @throws IOException    if reading the index fails
     * @throws ParseException if {@code searchString} is not a valid query
     */
    public static long searchIndex(IndexReader reader, String searchString) throws IOException, ParseException {
        long start = System.nanoTime();
        IndexSearcher indexSearcher = new IndexSearcher(reader);
        QueryParser parser = new QueryParser(Version.LUCENE_40, "text", new StandardAnalyzer(Version.LUCENE_40));
        Query query = parser.parse(searchString);
        ScoreDoc[] hits = indexSearcher.search(query, 10).scoreDocs;
        for (ScoreDoc hit : hits) {
            // Result is discarded on purpose: document retrieval is part of the timed work.
            indexSearcher.doc(hit.doc);
        }
        return elapsedMillis(start);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment