vaibhaw · August 29, 2015 14:19
diff --git a/lucene-ramdir-fsdir.java b/lucene-ramdir-fsdir.java
 package com.test.lucene;

 import java.io.File;
 import java.io.IOException;
 import java.io.StringReader;
 import java.util.Calendar;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.standard.StandardAnalyzer;
 import org.apache.lucene.document.Document;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.queries.mlt.MoreLikeThis;
 import org.apache.lucene.queryparser.classic.ParseException;
 import org.apache.lucene.queryparser.classic.QueryParser;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
 import org.apache.lucene.search.TopDocs;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.FSDirectory;
 import org.apache.lucene.store.IOContext;
 import org.apache.lucene.store.RAMDirectory;
 import org.apache.lucene.util.Version;

 import de.jetwick.snacktory.HtmlFetcher;
 import de.jetwick.snacktory.JResult;

 /**
  * Measures query processing time against a Lucene index opened through
  * {@link FSDirectory} and then through a {@link RAMDirectory} built from it,
  * using MoreLikeThis queries and simple parsed term queries.
  *
  * <p>Usage: {@code App [indexPath]}. When no path is given,
  * {@code DEFAULT_INDEX_PATH} is used.
  */
 public class App
 {
     // Alternative page set; swap it in for "pages" to benchmark other content.
     // private static String[] pages = {
     //     "http://insidetv.ew.com/2012/10/15/walking-dead-season-3-premiere-ratings/",
     //     "http://techcrunch.com/2012/10/16/facebook-rd-goes-global-opens-engineering-office-in-london-its-first-outside-the-u-s/",
     //     "http://www.bostondynamics.com/index.html",
     //     "http://www.readwriteweb.com/biz/2012/10/how-hard-is-it-to-get-and-use-a-3d-printer.php",
     //     "http://prettymuchamazing.com/news/album-stream-bat-for-lashes-the-haunted-man"};

     /** Pages whose extracted title and text seed the MoreLikeThis queries. */
     private static final String[] pages = {
         "http://www.lemonde.fr/sciences/article/2012/10/11/serge-haroche-il-y-a-une-contradiction-entre-le-temps-des-politiques-et-le-temps-de-la-recherche_1773818_1650684.html",
         "http://www.business-angel-france.com/business-angel-france-coup-de-gueule-faut-pas-prendre-les-business-angels-pour-des-mougeons",
         "http://www.larecherche.fr/content/actualite-sante/article?id=32457",
         "http://www.lesechos.fr/entreprises-secteurs/tech-medias/actu/0202329311759-open-data-l-etat-cherche-a-monetiser-les-donnees-publiques-500953.php",
         "http://www.usinenouvelle.com/article/les-pme-poumons-essentiels-de-l-economie-europeenne.N183637"};

     /**
      * Terms run through the query parser for the simple-search benchmark.
      * NOTE(review): "Nigth" looks like a typo for "Night"; it is left unchanged
      * because it is query input and changing it changes what is searched.
      */
     private static final String[] terms = {
         "Zombie",
         "Vampire",
         "Nigth",
         "Wolf",
         "Human"};

     /** Index location used when no path is passed on the command line. */
     private static final String DEFAULT_INDEX_PATH =
         "/Users/alejandra/Developments/apache-solr-4.0.0/example/solr/data/topics_fr/index";

     /** Multiplier applied to the query counts in the progress output. */
     private static final int factor = 1;

     /** Extracted content per page; an entry stays {@code null} when its fetch failed. */
     private static final String[] content = new String[pages.length];

     /**
      * Fetches the page content, then runs the same set of queries first against
      * the on-disk index and then against a RAMDirectory created from it,
      * printing the accumulated time of each run.
      *
      * @param args optional; {@code args[0]} overrides the index directory path
      */
     public static void main( String[] args ) {
         fetchContent();

         String indexPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;

         try {
             System.out.println("\nCreate FSIndex ...");
             FSDirectory fsi = FSDirectory.open(new File(indexPath));
             try {
                 IndexReader reader = DirectoryReader.open(fsi);
                 try {
                     runQueries(reader, "SearchMLT queries with FSIndex", "Simple search with FSIndex");
                 } finally {
                     // Close even when a query throws, so the index files are released.
                     reader.close();
                 }

                 // ---- same queries, index served from a RAMDirectory ----
                 System.out.println("\nCreating RAMDirectory... ");
                 long start = System.currentTimeMillis();
                 RAMDirectory rami = new RAMDirectory(fsi, IOContext.DEFAULT);
                 try {
                     System.out.println("...created index  in " + (System.currentTimeMillis() - start));

                     reader = DirectoryReader.open(rami);
                     try {
                         runQueries(reader, "SearchMLT queries with RAMDirectory ", "Simple search with RAMDirectory");
                     } finally {
                         reader.close();
                     }
                 } finally {
                     rami.close();
                 }
             } finally {
                 fsi.close();
             }
         } catch (IOException e) {
             System.err.println("Index access failed for " + indexPath);
             e.printStackTrace();
         } catch (ParseException e) {
             System.err.println("A search term could not be parsed");
             e.printStackTrace();
         }
     }

     /**
      * Downloads every page in {@code pages} and stores "title text" in the
      * matching slot of {@code content}. A page that cannot be fetched is
      * reported and its slot is left {@code null}.
      */
     private static void fetchContent() {
         System.out.println("Getting " + pages.length + " pages' content... ");
         for (int i = 0; i < pages.length; i++) {
             try {
                 JResult res = new HtmlFetcher().fetchAndExtract(pages[i], 10000, true);
                 content[i] = res.getTitle() + " " + res.getText();
             } catch (Exception e) {
                 // Best effort: one unreachable page must not abort the benchmark.
                 System.err.println("Could not fetch " + pages[i] + "; skipping it");
                 e.printStackTrace();
             }
         }
     }

     /**
      * Runs the MoreLikeThis benchmark followed by the simple-search benchmark
      * on one reader and prints the accumulated time of each.
      *
      * @param reader       open reader over the index under test
      * @param mltHeader    line printed before the MoreLikeThis run
      * @param simpleHeader line printed before the simple-search run
      */
     private static void runQueries(IndexReader reader, String mltHeader, String simpleHeader)
             throws IOException, ParseException {
         System.out.println(mltHeader);
         long time = 0;
         int processed = 0;
         for (String c : content) {
             if (c == null) {
                 // Fetch failed for this page; new StringReader(null) would throw.
                 continue;
             }
             time += searchMLT(reader, c);
             processed++;
         }
         System.out.println("...processed " + (processed * factor) + " in " + time);

         System.out.println(simpleHeader);
         time = 0;
         for (String t : terms) {
             time += searchIndex(reader, t);
         }
         System.out.println("...processed " + (terms.length * factor) + " in " + time);
     }

     /**
      * Builds a MoreLikeThis query on field "text" from the given text, fetches
      * the top 5 hits and loads each hit's stored document.
      *
      * @param reader open reader over the index to query
      * @param text   source text for the query; must not be {@code null}
      * @return elapsed wall-clock milliseconds for query construction, search
      *         and document loading
      * @throws IOException if the index cannot be read
      */
     public static long searchMLT(IndexReader reader, String text) throws IOException {
         long start = System.currentTimeMillis();

         IndexSearcher searcher = new IndexSearcher(reader);
         MoreLikeThis mlt = new MoreLikeThis(reader);
         mlt.setAnalyzer(new StandardAnalyzer(Version.LUCENE_40));
         mlt.setFieldNames(new String[] {"text"});
         mlt.setMinWordLen(3);
         mlt.setMinTermFreq(3);
         mlt.setMinDocFreq(2);

         Query query = mlt.like(new StringReader(text), "text");
         TopDocs topDocs = searcher.search(query, 5);

         for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
             // Result is discarded on purpose: loading the document is part of the timed work.
             searcher.doc(scoreDoc.doc);
         }
         return System.currentTimeMillis() - start;
     }

     /**
      * Parses the search string against field "text", fetches the top 10 hits
      * and loads each hit's stored document.
      *
      * @param reader       open reader over the index to query
      * @param searchString query text in classic query-parser syntax
      * @return elapsed wall-clock milliseconds for parsing, search and
      *         document loading
      * @throws IOException    if the index cannot be read
      * @throws ParseException if the search string is not a valid query
      */
     public static long searchIndex(IndexReader reader, String searchString) throws IOException, ParseException {
         long start = System.currentTimeMillis();

         IndexSearcher indexSearcher = new IndexSearcher(reader);
         QueryParser parser = new QueryParser(Version.LUCENE_40, "text", new StandardAnalyzer(Version.LUCENE_40));
         Query query = parser.parse(searchString);
         ScoreDoc[] hits = indexSearcher.search(query, 10).scoreDocs;

         for (ScoreDoc hit : hits) {
             // Result is discarded on purpose: loading the document is part of the timed work.
             indexSearcher.doc(hit.doc);
         }
         return System.currentTimeMillis() - start;
     }
 }
	package com.test.lucene;

	import java.io.File;
	import java.io.IOException;
	import java.io.StringReader;
	import java.util.Calendar;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.standard.StandardAnalyzer;
	import org.apache.lucene.document.Document;
	import org.apache.lucene.index.DirectoryReader;
	import org.apache.lucene.index.IndexReader;
	import org.apache.lucene.queries.mlt.MoreLikeThis;
	import org.apache.lucene.queryparser.classic.ParseException;
	import org.apache.lucene.queryparser.classic.QueryParser;
	import org.apache.lucene.search.IndexSearcher;
	import org.apache.lucene.search.Query;
	import org.apache.lucene.search.ScoreDoc;
	import org.apache.lucene.search.TopDocs;
	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.FSDirectory;
	import org.apache.lucene.store.IOContext;
	import org.apache.lucene.store.RAMDirectory;
	import org.apache.lucene.util.Version;

	import de.jetwick.snacktory.HtmlFetcher;
	import de.jetwick.snacktory.JResult;

	/**
	 * Measures query processing time against a Lucene index opened through
	 * {@link FSDirectory} and then through a {@link RAMDirectory} built from it,
	 * using MoreLikeThis queries and simple parsed term queries.
	 *
	 * <p>Usage: {@code App [indexPath]}. When no path is given,
	 * {@code DEFAULT_INDEX_PATH} is used.
	 */
	public class App
	{
	    // Alternative page set; swap it in for "pages" to benchmark other content.
	    // private static String[] pages = {
	    //     "http://insidetv.ew.com/2012/10/15/walking-dead-season-3-premiere-ratings/",
	    //     "http://techcrunch.com/2012/10/16/facebook-rd-goes-global-opens-engineering-office-in-london-its-first-outside-the-u-s/",
	    //     "http://www.bostondynamics.com/index.html",
	    //     "http://www.readwriteweb.com/biz/2012/10/how-hard-is-it-to-get-and-use-a-3d-printer.php",
	    //     "http://prettymuchamazing.com/news/album-stream-bat-for-lashes-the-haunted-man"};

	    /** Pages whose extracted title and text seed the MoreLikeThis queries. */
	    private static final String[] pages = {
	        "http://www.lemonde.fr/sciences/article/2012/10/11/serge-haroche-il-y-a-une-contradiction-entre-le-temps-des-politiques-et-le-temps-de-la-recherche_1773818_1650684.html",
	        "http://www.business-angel-france.com/business-angel-france-coup-de-gueule-faut-pas-prendre-les-business-angels-pour-des-mougeons",
	        "http://www.larecherche.fr/content/actualite-sante/article?id=32457",
	        "http://www.lesechos.fr/entreprises-secteurs/tech-medias/actu/0202329311759-open-data-l-etat-cherche-a-monetiser-les-donnees-publiques-500953.php",
	        "http://www.usinenouvelle.com/article/les-pme-poumons-essentiels-de-l-economie-europeenne.N183637"};

	    /**
	     * Terms run through the query parser for the simple-search benchmark.
	     * NOTE(review): "Nigth" looks like a typo for "Night"; it is left unchanged
	     * because it is query input and changing it changes what is searched.
	     */
	    private static final String[] terms = {
	        "Zombie",
	        "Vampire",
	        "Nigth",
	        "Wolf",
	        "Human"};

	    /** Index location used when no path is passed on the command line. */
	    private static final String DEFAULT_INDEX_PATH =
	        "/Users/alejandra/Developments/apache-solr-4.0.0/example/solr/data/topics_fr/index";

	    /** Multiplier applied to the query counts in the progress output. */
	    private static final int factor = 1;

	    /** Extracted content per page; an entry stays {@code null} when its fetch failed. */
	    private static final String[] content = new String[pages.length];

	    /**
	     * Fetches the page content, then runs the same set of queries first against
	     * the on-disk index and then against a RAMDirectory created from it,
	     * printing the accumulated time of each run.
	     *
	     * @param args optional; {@code args[0]} overrides the index directory path
	     */
	    public static void main( String[] args ) {
	        fetchContent();

	        String indexPath = (args != null && args.length > 0) ? args[0] : DEFAULT_INDEX_PATH;

	        try {
	            System.out.println("\nCreate FSIndex ...");
	            FSDirectory fsi = FSDirectory.open(new File(indexPath));
	            try {
	                IndexReader reader = DirectoryReader.open(fsi);
	                try {
	                    runQueries(reader, "SearchMLT queries with FSIndex", "Simple search with FSIndex");
	                } finally {
	                    // Close even when a query throws, so the index files are released.
	                    reader.close();
	                }

	                // ---- same queries, index served from a RAMDirectory ----
	                System.out.println("\nCreating RAMDirectory... ");
	                long start = System.currentTimeMillis();
	                RAMDirectory rami = new RAMDirectory(fsi, IOContext.DEFAULT);
	                try {
	                    System.out.println("...created index in " + (System.currentTimeMillis() - start));

	                    reader = DirectoryReader.open(rami);
	                    try {
	                        runQueries(reader, "SearchMLT queries with RAMDirectory ", "Simple search with RAMDirectory");
	                    } finally {
	                        reader.close();
	                    }
	                } finally {
	                    rami.close();
	                }
	            } finally {
	                fsi.close();
	            }
	        } catch (IOException e) {
	            System.err.println("Index access failed for " + indexPath);
	            e.printStackTrace();
	        } catch (ParseException e) {
	            System.err.println("A search term could not be parsed");
	            e.printStackTrace();
	        }
	    }

	    /**
	     * Downloads every page in {@code pages} and stores "title text" in the
	     * matching slot of {@code content}. A page that cannot be fetched is
	     * reported and its slot is left {@code null}.
	     */
	    private static void fetchContent() {
	        System.out.println("Getting " + pages.length + " pages' content... ");
	        for (int i = 0; i < pages.length; i++) {
	            try {
	                JResult res = new HtmlFetcher().fetchAndExtract(pages[i], 10000, true);
	                content[i] = res.getTitle() + " " + res.getText();
	            } catch (Exception e) {
	                // Best effort: one unreachable page must not abort the benchmark.
	                System.err.println("Could not fetch " + pages[i] + "; skipping it");
	                e.printStackTrace();
	            }
	        }
	    }

	    /**
	     * Runs the MoreLikeThis benchmark followed by the simple-search benchmark
	     * on one reader and prints the accumulated time of each.
	     *
	     * @param reader       open reader over the index under test
	     * @param mltHeader    line printed before the MoreLikeThis run
	     * @param simpleHeader line printed before the simple-search run
	     */
	    private static void runQueries(IndexReader reader, String mltHeader, String simpleHeader)
	            throws IOException, ParseException {
	        System.out.println(mltHeader);
	        long time = 0;
	        int processed = 0;
	        for (String c : content) {
	            if (c == null) {
	                // Fetch failed for this page; new StringReader(null) would throw.
	                continue;
	            }
	            time += searchMLT(reader, c);
	            processed++;
	        }
	        System.out.println("...processed " + (processed * factor) + " in " + time);

	        System.out.println(simpleHeader);
	        time = 0;
	        for (String t : terms) {
	            time += searchIndex(reader, t);
	        }
	        System.out.println("...processed " + (terms.length * factor) + " in " + time);
	    }

	    /**
	     * Builds a MoreLikeThis query on field "text" from the given text, fetches
	     * the top 5 hits and loads each hit's stored document.
	     *
	     * @param reader open reader over the index to query
	     * @param text   source text for the query; must not be {@code null}
	     * @return elapsed wall-clock milliseconds for query construction, search
	     *         and document loading
	     * @throws IOException if the index cannot be read
	     */
	    public static long searchMLT(IndexReader reader, String text) throws IOException {
	        long start = System.currentTimeMillis();

	        IndexSearcher searcher = new IndexSearcher(reader);
	        MoreLikeThis mlt = new MoreLikeThis(reader);
	        mlt.setAnalyzer(new StandardAnalyzer(Version.LUCENE_40));
	        mlt.setFieldNames(new String[] {"text"});
	        mlt.setMinWordLen(3);
	        mlt.setMinTermFreq(3);
	        mlt.setMinDocFreq(2);

	        Query query = mlt.like(new StringReader(text), "text");
	        TopDocs topDocs = searcher.search(query, 5);

	        for (ScoreDoc scoreDoc : topDocs.scoreDocs) {
	            // Result is discarded on purpose: loading the document is part of the timed work.
	            searcher.doc(scoreDoc.doc);
	        }
	        return System.currentTimeMillis() - start;
	    }

	    /**
	     * Parses the search string against field "text", fetches the top 10 hits
	     * and loads each hit's stored document.
	     *
	     * @param reader       open reader over the index to query
	     * @param searchString query text in classic query-parser syntax
	     * @return elapsed wall-clock milliseconds for parsing, search and
	     *         document loading
	     * @throws IOException    if the index cannot be read
	     * @throws ParseException if the search string is not a valid query
	     */
	    public static long searchIndex(IndexReader reader, String searchString) throws IOException, ParseException {
	        long start = System.currentTimeMillis();

	        IndexSearcher indexSearcher = new IndexSearcher(reader);
	        QueryParser parser = new QueryParser(Version.LUCENE_40, "text", new StandardAnalyzer(Version.LUCENE_40));
	        Query query = parser.parse(searchString);
	        ScoreDoc[] hits = indexSearcher.search(query, 10).scoreDocs;

	        for (ScoreDoc hit : hits) {
	            // Result is discarded on purpose: loading the document is part of the timed work.
	            indexSearcher.doc(hit.doc);
	        }
	        return System.currentTimeMillis() - start;
	    }
	}
No results found