package tfidf;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;

public class makeTfidfLongvectors {

	public static int numDocuments;

	public ArrayList loadUserDocs() throws IOException {
		// nb returns an ArrayList; each element is a String[] of size 2: [username, document]
		ArrayList userDocs = new ArrayList();

		// one document per line. format: [username**MARK**document content.....]
		String docPath = "/path/to/documentsFile";
		BufferedReader br = new BufferedReader(new FileReader(new File(docPath)));

		String line;
		String doc;
		String user;
		String[] userAndDoc;
		int countLine = 0;
		int parseErrs = 0;

		while ((line = br.readLine()) != null) {
			try {
				// each line contains the user's name, then their document, separated by "**MARK**"
				userAndDoc = line.split("\\*\\*MARK\\*\\*");
				user = userAndDoc[0];
				doc = userAndDoc[1];
				// skip near-empty documents
				if (doc.length() > 3) {
					userDocs.add(userAndDoc);
				}
				countLine++;
			} catch (Exception e) {
				parseErrs++;
			}
		}
		br.close();
		System.out.println("Parse errors: " + parseErrs);
		System.out.println("Num lines: " + countLine);
		numDocuments = userDocs.size();
		System.out.println("num docs: " + numDocuments);
		return userDocs;
	}

	public HashMap loadVocabMap() throws IOException {
		// the file contains each unique word in the corpus, plus the number of documents it's found in.
		// format: [word frequency]
		// returned as a word:frequency map
		String vocabFilePath = "/path/to/docFreqs.data";
		HashMap<String, Integer> vocabCount = new HashMap<String, Integer>();
		String line;
		BufferedReader br = new BufferedReader(new FileReader(new File(vocabFilePath)));
		String[] thisWordAndFreq;
		String key;
		Integer value;
		while ((line = br.readLine()) != null) {
			thisWordAndFreq = line.split(" ");
			key = thisWordAndFreq[0];
			value = Integer.parseInt(thisWordAndFreq[1]);
			if (thisWordAndFreq[0].length() > 2) { // ie if a word is actually there and not whitespace etc.
				vocabCount.put(key, value);
			}
		}
		br.close();
		return vocabCount;
	}

	public static void main(String[] args) throws IOException {
		makeTfidfLongvectors mtl = new makeTfidfLongvectors();

		HashMap vocabAndFreq = mtl.loadVocabMap();
		ArrayList vocabList = mtl.makeVocabList();
		System.out.println("vocab list size: " + vocabList.size());
		ArrayList documents = mtl.loadUserDocs(); // rem that each elem is [uname, doc]
		ArrayList<Double> initDocMatrix;
		ArrayList<Double> tfidfLongMatrix;

		for (int i = 0; i < documents.size(); i++) {
			initDocMatrix = mtl.initialiseDocMatrix(vocabList);
			String[] thisDocList = (String[]) documents.get(i);
			String user = thisDocList[0];
			String userDoc = thisDocList[1];
			tfidfLongMatrix = makeTfidfMatrix(userDoc, vocabAndFreq, initDocMatrix, vocabList);
			mtl.writeLine(user, tfidfLongMatrix);

			if (i % 500 == 0) {
				System.out.println(i + " of " + documents.size() + " written");
			}
		}
	}

	private void writeLine(String user, ArrayList<Double> tfidfLongMatrix) throws IOException {
		// appends one tf-idf weighted vector to the output file: [username score1 score2 ...]
		String matrixFilePath = "/destinationFolder/tfidfVectors.data";
		FileWriter fw = new FileWriter(matrixFilePath, true);
		fw.write(user + " ");
		DecimalFormat fiveDForm = new DecimalFormat("#.#####"); // five decimal places
		Iterator iter = tfidfLongMatrix.iterator();
		while (iter.hasNext()) {
			fw.write(fiveDForm.format(iter.next()) + " ");
		}
		fw.write("\n");
		fw.close();
	}

	private ArrayList makeVocabList() throws IOException {
		// as well as the vocab/frequency hashmap, i need an ArrayList, which is used to ensure
		// the tf-idf scores are placed in the same order in every vector.
		// reads the same docFreqs file as loadVocabMap()
		String vocabFilePath = "/path/to/docFreqs.data";
		ArrayList vocab = new ArrayList();
		String line;
		BufferedReader br = new BufferedReader(new FileReader(new File(vocabFilePath)));
		String[] thisWordAndFreq;
		String word;
		while ((line = br.readLine()) != null) {
			thisWordAndFreq = line.split(" ");
			word = thisWordAndFreq[0];
			if (thisWordAndFreq[0].length() > 2) { // ie if a word is actually there and not whitespace etc.
				vocab.add(word);
			}
		}
		br.close();
		return vocab;
	}

	private static ArrayList<Double> makeTfidfMatrix(String userDoc, HashMap vocabAndFreq, ArrayList<Double> docMatrix, ArrayList vocabList) {
		String[] docSplit = userDoc.split(" ");
		// find the unique set of words in this document
		Set<String> wordSet = new HashSet<String>(Arrays.asList(docSplit));
		Iterator<String> setIter = wordSet.iterator();
		int docLen = docSplit.length;
		int errs = 0;
		while (setIter.hasNext()) {
			String word = setIter.next();
			try {
				Double wordTfidfScore = getWordTfidf(word, docSplit, vocabAndFreq, docLen);
				// find the place of that word in the vocab and store its score there
				int place = vocabList.indexOf(word);
				docMatrix.set(place, wordTfidfScore);
			} catch (Exception e) {
				errs++; // ie word isn't in vocab, eg it was a stop word
			}
		}
		return docMatrix;
	}

	private static Double getWordTfidf(String word, String[] docSplit, HashMap vocabAndFreq, int docLen) {
		double tf = getTf(word, docSplit, docLen);
		double idf = getIdf(word, (Integer) vocabAndFreq.get(word));
		return tf * idf;
	}

	private static double getIdf(String word, int numDocsContainingWord) {
		return Math.log((numDocuments * 1.0) / numDocsContainingWord);
	}

	private static double getTf(String word, String[] docSplit, int docLen) {
		// number of occurrences of this word in the document
		int termFreq = 0;
		for (int k = 0; k < docSplit.length; k++) {
			if (word.equals(docSplit[k])) { // was ==, which compares references rather than string contents
				termFreq++;
			}
		}
		return termFreq / (double) docLen;
	}

	private ArrayList<Double> initialiseDocMatrix(ArrayList vocabList) {
		// set up an initial vector of the correct size (the size of the corpus vocab), comprised of zeros
		ArrayList<Double> initDocMatrix = new ArrayList<Double>();
		for (int i = 0; i < vocabList.size(); i++) {
			initDocMatrix.add(0.0);
		}
		return initDocMatrix;
	}
}
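For reference, the three hard-coded strings are the program's I/O contract: docPath points at the documents file (one username**MARK**document line per document), vocabFilePath points at the document-frequency file that both loadVocabMap() and makeVocabList() read (one word and its document count per line), and matrixFilePath is where writeLine() appends the finished vectors. A minimal illustration with made-up contents, not from any real dataset:

/path/to/documentsFile:
alice**MARK**the quick brown fox jumps over the lazy dog
bob**MARK**the lazy cat sleeps all day

/path/to/docFreqs.data (partial):
the 2
lazy 2
quick 1

With numDocuments = 2, "quick" occurs once in alice's nine-word document, so tf = 1/9 and idf = ln(2/1) ≈ 0.693, giving a tf-idf score of about 0.077. The output file gets one line per user: the username followed by one five-decimal score per vocabulary word, in makeVocabList() order. To run it, edit the three path strings, compile with javac, and invoke the class's main().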
Function: getIdf(String word, int numDocsContainingWord)
Values passed: getIdf(word, (Integer)vocabAndFreq.get(word));
vocabAndFreq maps each vocabulary word to a frequency, e.g. test, 11.
For idf you need document frequency. How will (Integer)vocabAndFreq.get(word) give you that? It gives you a number of occurrences. What if a word is repeated 5 times in one document? In that case your df would be incorrect if you use the code above.
Correct me if I'm wrong.
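For what it's worth, the comment in loadVocabMap() says the file holds "the number of documents it's found in", so getIdf() is only correct if docFreqs.data really contains document frequencies rather than total occurrence counts. A minimal sketch of how such a file could be built, assuming the same documents-file format as above (the class name and paths here are made up for illustration):

import java.io.BufferedReader;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;

public class BuildDocFreqs {
	public static void main(String[] args) throws IOException {
		HashMap<String, Integer> docFreq = new HashMap<String, Integer>();
		BufferedReader br = new BufferedReader(new FileReader("/path/to/documentsFile"));
		String line;
		while ((line = br.readLine()) != null) {
			String[] userAndDoc = line.split("\\*\\*MARK\\*\\*");
			if (userAndDoc.length < 2) continue; // skip malformed lines
			// a Set collapses repeats, so a word occurring 5 times in one
			// document still adds only 1 to its document frequency
			Set<String> uniqueWords = new HashSet<String>(Arrays.asList(userAndDoc[1].split(" ")));
			for (String word : uniqueWords) {
				docFreq.merge(word, 1, Integer::sum);
			}
		}
		br.close();
		FileWriter fw = new FileWriter("/path/to/docFreqs.data");
		for (Map.Entry<String, Integer> e : docFreq.entrySet()) {
			fw.write(e.getKey() + " " + e.getValue() + "\n");
		}
		fw.close();
	}
}

Because each document contributes its word set rather than its word list, a word repeated 5 times in one document still only adds 1 to its count, which is exactly the distinction raised above.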
String vocabFilePath="C://datasets//twitter_data//sep11//forCossim//docFreqs_790-839.data";   //(within makeVocabList())
String vocabFilePath="/path/to/docFreqs.data";   //(within loadVocabMap())
What are these for? Can you explain?
Hello, I've learned a lot from your code, but could you tell me what these mean?
String vocabFilePath="C://datasets//twitter_data//sep11//forCossim//docFreqs_790-839.data";
String matrixFilePath="/destinationFolder/tfidfVectors.data";
String vocabFilePath="/path/to/docFreqs.data";
Please, I need your answer.
Getting an error on this line:
make_tfidf_longvectors mtl= new make_tfidf_longvectors();
Where do I find this 'make_tfidf_longvectors()' constructor, or how do I use it?
I can't run this code... can you suggest a few tips?
import java.io.BufferedReader;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.text.DecimalFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
public class makeTfidfLongvectors {
public static int numDocuments;
public ArrayList loadUserDocs() throws IOException{
	//nb returns arraylist, each element is an array size 2
	ArrayList userDocs= new ArrayList();
	ArrayList tempUserDoc= new ArrayList();
	
            // one document per line. format: [username**MARK**document content.....]
	String docPath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\dataset2.txt";
	BufferedReader br= new BufferedReader(new FileReader(new File(docPath)));
	
	String line;
	String doc;
	String user;
	String[] userAndDoc;
	int countLine=0;
	int parseErrs=0;
	
	while ((line=br.readLine())!=null){
		//System.out.println(line);
		try{
		//each line contains the user's name, then their document, separated by "**MARK**"
		userAndDoc=line.split("\\*\\*MARK\\*\\*");
		user=userAndDoc[0];
		doc=userAndDoc[1];
		//System.out.println(user+doc);
		//if (doc.length()>3){
			//userDocs.add(userAndDoc);
		//}
					
		countLine++;
		}catch (Exception e){parseErrs++;}
		
		
	}
	System.out.println(parseErrs);
	
	System.out.println("Num lines: "+countLine);
	this.numDocuments=userDocs.size();
	System.out.println("num docs: "+this.numDocuments);
	
	return userDocs; 
}
public HashMap loadVocabMap() throws IOException{
            //contains each unique word in the corpus, plus the number of documents it's found in.
            //format: [word frequency]
            //returned as a word:frequency map
	String vocabFilePath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\document.txt";
	
	HashMap<String,Integer> vocabCount=new HashMap();
	String line="";
	BufferedReader br= new BufferedReader(new FileReader(new File(vocabFilePath)));
	String[] thisWordAndFreq;
	String key;
	Integer value;
	while((line=br.readLine())!=null){
		thisWordAndFreq=line.split(" ");
		key=thisWordAndFreq[0];
		value=Integer.parseInt(thisWordAndFreq[1]);
		if (thisWordAndFreq[0].length()>2){ //ie if a word is actually there and not whitespace etc.
			vocabCount.put(key, value);
		}
	}
	return vocabCount;
	
}
public static void main(String[] args) throws IOException{
	int count=0;
	makeTfidfLongvectors mtl= new makeTfidfLongvectors();
	ArrayList vocabList= new ArrayList();
	
	HashMap vocabAndFreq= mtl.loadVocabMap();
	vocabList=mtl.makeVocabList(); //update vocabList defined in class
	System.out.println("vocab list size:  "+vocabList.size());
	ArrayList documents=mtl.loadUserDocs(); //rem that each elem is [[uname][doc]]
	ArrayList<Double> initDocMatrix;
	ArrayList docMatrices;
	ArrayList<Double> tfidfLongMatrix;
	String[] docSplit;
	String docStr;
	
	
	for(int i=0;i<documents.size();i++){
		
		initDocMatrix=mtl.initialiseDocMatrix(vocabList);
		
		String[] thisDocList=(String[]) documents.get(i);
		String user=thisDocList[0];
		String userDoc=thisDocList[1];
		tfidfLongMatrix=makeTfidfMatrix(userDoc, vocabAndFreq, initDocMatrix,vocabList);
		mtl.writeLine(user, tfidfLongMatrix);
		
		if (i%500==0){
			System.out.println(i+" of "+ documents.size()+" written");
		}
		
	}
	
}
private void writeLine(String user, ArrayList<Double> tfidfLongMatrix) throws IOException {
            //writes tf-idf weighted vectors to file
	String matrixFilePath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\tfidfVectors.txt";
	FileWriter fw=new FileWriter(matrixFilePath,true);
	fw.write(user+" ");
	DecimalFormat fourDForm = new DecimalFormat("#.#####");
	Iterator iter= tfidfLongMatrix.iterator();
	while (iter.hasNext()){
		fw.write(String.valueOf(fourDForm.format(iter.next()))+" ");
	}
	fw.write("\n");
	fw.close();	
	
}
private ArrayList makeVocabList() throws IOException{
            //as well as vocab/frequency hashmap, i need an arraylist, which is used to ensure the placing of tf-idf scores in the same order in the vector.               
	String vocabFilePath="C:\\Users\\Kaz Innovations\\eclipse-workspace\\TFIDF Calculator\\documents\\document.txt";
	ArrayList vocab=new ArrayList();
	String line="";
	BufferedReader br= new BufferedReader(new FileReader(new File(vocabFilePath)));
	String[] thisWordAndFreq;
	String word;
	
	
	while((line=br.readLine())!=null){
		thisWordAndFreq=line.split(" ");
		word=thisWordAndFreq[0];
		if (thisWordAndFreq[0].length()>2){ //ie if a word is actually there and not whitespace etc.
			vocab.add(word);
		}
	}
	return vocab;
	
}
private static ArrayList<Double> makeTfidfMatrix(String userDoc, HashMap vocabAndFreq, ArrayList<Double> docMatrix,ArrayList vocabList) {
	String[] docSplit=userDoc.split(" ");
	//find unique set of words
	Set<String> wordSet=new HashSet(Arrays.asList(docSplit));
	Iterator setIter= wordSet.iterator();
	int docLen=docSplit.length;
	int errs=0;
	while (setIter.hasNext()){
		String word=(String) setIter.next();
		try{
			Double wordTfidfScore=getWordTfidf(word, docSplit, vocabAndFreq, docLen);
			//find place of that word in vocab
			int place=vocabList.indexOf(word);
			docMatrix.set(place, wordTfidfScore);
			
		}catch(Exception e){errs++;//ie word isn't in vocab. ie was a stop word etc.
		}
		
	}
	//System.out.println(errs);
	return docMatrix;
}
private static Double getWordTfidf(String word, String[] docSplit, HashMap vocabAndFreq, int docLen) {
	double tf=getTf(word, docSplit,docLen);
	double idf=getIdf(word, (Integer)vocabAndFreq.get(word));
	double tfidf=tf*idf;
	
	
	return tfidf;
	
}
private static double getIdf(String word, int numDocsContainingWord) {
	
	
	return Math.log(((numDocuments*1.0)/numDocsContainingWord));
}
private static double getTf(String word, String[] docSplit, int docLen) {
	//number of occurrences of this word in document
	int termFreq=0;
	for(int k=0;k<docSplit.length;k++){
		if (word==docSplit[k]){
			termFreq++;
		}
	}
	return (termFreq/(float)docSplit.length);
}
private ArrayList initialiseDocMatrix(ArrayList vocabList) {
            //set up an initial vector of the correct size (the size of the corpus vocab.) comprised of zeros
	ArrayList initDocMatrix= new ArrayList();
	for (int i=0;i<vocabList.size();i++){
		initDocMatrix.add(0.0);
		
	}
	return initDocMatrix;
}
}
See, I have given the input like this... is there any problem with how I'm giving the input? If so, please let me know.
How do I execute this? Where would I place the file path?