Created
February 13, 2018 17:57
-
-
Save Jawn78/bab736aeeb9c5b0f64d99e7aceeb1d98 to your computer and use it in GitHub Desktop.
Revisions
-
Jawn78 created this gist
Feb 13, 2018 .There are no files selected for viewing
/*
 * To change this license header, choose License Headers in Project Properties.
 * To change this template file, choose Tools | Templates
 * and open the template in the editor.
 */
package rex1nlp;

import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.namefind.TokenNameFinderModel;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.sax.BodyContentHandler;
import org.xml.sax.ContentHandler;

/**
 * Extracts the body text of a document with Apache Tika, tokenizes it with an
 * OpenNLP tokenizer model, prints every token on its own line, and then runs
 * an OpenNLP person-name finder over the tokens.
 *
 * <p>Usage: {@code rexNERex [document] [tokenizerModel] [personModel]}. Each
 * argument is optional; a missing argument falls back to the corresponding
 * hard-coded default path below.
 */
public class rexNERex {

    /** Document parsed when no path is given on the command line. */
    private static final String DEFAULT_DOCUMENT =
            "C:\\Users\\RexPC\\Documents\\Haily.docx";

    /** Tokenizer model loaded when no path is given on the command line. */
    private static final String DEFAULT_TOKENIZER_MODEL =
            "C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-token.bin";

    /** Person-name finder model loaded when no path is given on the command line. */
    private static final String DEFAULT_PERSON_MODEL =
            "C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-ner-person.bin";

    /**
     * Program entry point.
     *
     * @param args optional overrides: {@code args[0]} document path,
     *             {@code args[1]} tokenizer model path, {@code args[2]}
     *             person-name model path
     * @throws Exception if the document or a model cannot be read or parsed
     */
    public static void main(String args[]) throws Exception {
        String target = argOrDefault(args, 0, DEFAULT_DOCUMENT);
        String tokenizerModelPath = argOrDefault(args, 1, DEFAULT_TOKENIZER_MODEL);
        String personModelPath = argOrDefault(args, 2, DEFAULT_PERSON_MODEL);

        // Getting the document body as a single string
        String text = extractText(new File(target));
        // System.out.println(text);

        // Tokenizing the text in to a string array
        String[] tokens = tokenize(text, tokenizerModelPath);
        for (String token : tokens) {
            System.out.println(token);
        }

        // Finding the names in the tokens. The result is not printed at the
        // moment; re-enable the loop below to see the spans.
        Span[] nameSpans = findPersonNames(tokens, personModelPath);

        // Printing the names and their spans in a sentence
        // for (Span s : nameSpans)
        //     System.out.println(s.toString());

        /*
        Disabled experiment: sentence detection before name finding.

        InputStream modelIn = new FileInputStream("C:\\Users\\RexPC\\Documents\\Programming\\Apache OpenNLP\\Models\\Original OpenNLP Models\\en-sent.bin");
        SentenceModel stcmodel = null;
        try {
            stcmodel = new SentenceModel(modelIn);
        } catch (IOException e) {
        }

        //Instantiating the SentenceDetectorME class
        SentenceDetectorME detector = new SentenceDetectorME(stcmodel);
        String sentences[];
        sentences = detector.sentDetect(handler.toString());

        //Finding the names in the sentence
        Span nameSpans[] = nameFinder.find(sentences);

        //Printing the spans of the names in the sentence
        for(Span s: nameSpans)
            System.out.println(s.toString());
        */
    }

    /** Returns {@code args[index]} when present, otherwise {@code fallback}. */
    private static String argOrDefault(String[] args, int index, String fallback) {
        return (args != null && args.length > index) ? args[index] : fallback;
    }

    /** Parses {@code document} with Tika's auto-detecting parser and returns its body text. */
    private static String extractText(File document) throws Exception {
        Parser parser = new AutoDetectParser();
        // -1 disables the default 100,000-character write limit so that
        // large documents are not rejected part-way through parsing.
        ContentHandler handler = new BodyContentHandler(-1);
        Metadata metadata = new Metadata();
        // The parser does not close the stream it is given, so close it here.
        try (InputStream documentStream = new FileInputStream(document)) {
            parser.parse(documentStream, handler, metadata, new ParseContext());
        }
        return handler.toString();
    }

    /** Loads the tokenizer model at {@code modelPath} and splits {@code text} into tokens. */
    private static String[] tokenize(String text, String modelPath) throws Exception {
        TokenizerModel tokenModel;
        try (InputStream modelStream = new FileInputStream(modelPath)) {
            tokenModel = new TokenizerModel(modelStream);
        }
        TokenizerME tokenizer = new TokenizerME(tokenModel);
        return tokenizer.tokenize(text);
    }

    /** Loads the name-finder model at {@code modelPath} and returns the name spans found in {@code tokens}. */
    private static Span[] findPersonNames(String[] tokens, String modelPath) throws Exception {
        TokenNameFinderModel model;
        try (InputStream modelStream = new FileInputStream(modelPath)) {
            model = new TokenNameFinderModel(modelStream);
        }
        NameFinderME nameFinder = new NameFinderME(model);
        return nameFinder.find(tokens);
    }
}