# A Collection of NLP notes
## N-grams
### Calculating unigram probabilities:
$$P(w_i) = \frac{\operatorname{count}(w_i)}{\text{total number of words}}$$
In English...
| SPC | |
| SPC: find file | |
| , switch buffer | |
| . browse files | |
| : MX | |
| ; EX | |
| < switch buffer | |
| ` eval | |
| u universal arg | |
| x pop up scratch |
| # | |
| # stopwords.txt | |
| # | |
| # Freely available stopword list, balancing coverage and size. | |
| # | |
| # From http://www.lextek.com/manuals/onix/stopwords1.html | |
| a | |
| about | |
| above | |
| across |
| #!/usr/bin/python | |
| """Python script to create a histogram of words in a text file. | |
| Usage: python word_frequency.py -f "/path/to/file.txt" -n 200 | |
| Specify the path to the text file as above. Manually specify the top N words to report (default 100). | |
| Text file can contain punctuation, new lines, etc., but special characters aren't handled well. |
| """Information Retrieval metrics | |
| Useful Resources: | |
| http://www.cs.utexas.edu/~mooney/ir-course/slides/Evaluation.ppt | |
| http://www.nii.ac.jp/TechReports/05-014E.pdf | |
| http://www.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf | |
| http://hal.archives-ouvertes.fr/docs/00/72/67/60/PDF/07-busa-fekete.pdf | |
| Learning to Rank for Information Retrieval (Tie-Yan Liu) | |
| """ | |
| import numpy as np |
| #!/usr/bin/env python | |
| # -*- coding: utf-8 -*- | |
| # @Date : 2017-03-20 07:15:37 | |
| # @Author : Anurag Roy ([email protected]) | |
| # @Link : ranarag.github.io | |
| # @Version : 1.0.0 | |
| import os | |
| import tensorflow as tf | |
| from sklearn.metrics.pairwise import cosine_similarity |
| import tensorflow as tf | |
| import numpy as np | |
| corpus_raw = 'He is the king . The king is royal . She is the royal queen ' | |
| # convert to lower case | |
| corpus_raw = corpus_raw.lower() | |
| words = [] | |
| for word in corpus_raw.split(): |
# A Collection of NLP notes
## N-grams
### Calculating unigram probabilities:
$$P(w_i) = \frac{\operatorname{count}(w_i)}{\text{total number of words}}$$
In English...
| import tensorflow as tf | |
| import numpy as np | |
| class TextCNN(object): | |
| """ | |
| A CNN for text classification. | |
| Uses an embedding layer, followed by a convolutional, max-pooling and softmax layer. | |
| """ | |
| def __init__( |
| # Typical setup to include TensorFlow. | |
| import tensorflow as tf | |
| # Make a queue of file names including all the JPEG images files in the relative | |
| # image directory. | |
| filename_queue = tf.train.string_input_producer( | |
| tf.train.match_filenames_once("./images/*.jpg")) | |
| # Read an entire image file which is required since they're JPEGs, if the images | |
| # are too large they could be split in advance to smaller files or use the Fixed |
| <?xml version="1.0" encoding="UTF-8"?> | |
| <!-- | |
| Copyright (C) 2014 Leo Iannacone <[email protected]> | |
| This file was generated from a textmate theme named Monokai Extended | |
| with tm2gtksw2 tool. (Alexandre da Silva) | |
| This library is free software; you can redistribute it and/or | |
| modify it under the terms of the GNU Library General Public | |
| License as published by the Free Software Foundation; either |