Table of Contents
Tokenizing
Tokenizing is the process of taking a text and breaking it into words or sentences.
Importing libraries and defining the text
# Import NLTK plus the word- and sentence-level tokenizer helpers used below.
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt') # download the Punkt sentence-tokenizer models required by sent_tokenize/word_tokenize
# Sample text used throughout the tokenizing examples.
text = "Do. Or do not. There is no try."
Break into sentences
# Split the text on sentence boundaries (Punkt handles abbreviations, etc.).
sentences = sent_tokenize(text) # ['Do.', 'Or do not.', 'There is no try.']
Break into words
# Split the text into word tokens; note punctuation comes out as its own token.
words = word_tokenize(text) # ['Do', '.', 'Or', 'do', 'not', '.', 'There', 'is', 'no', 'try', '.']
Use a comprehension to break each sentence into words
# Tokenize each sentence separately, yielding one list of tokens per sentence.
words_in_sentences = [ word_tokenize( sentence ) for sentence in sentences ] # [['Do', '.'], ['Or', 'do', 'not', '.'], ['There', 'is', 'no', 'try', '.']]
Removing Stop Words
# Stop words are high-frequency words (the, is, at, ...) usually filtered out.
from nltk.corpus import stopwords
nltk.download('stopwords') # will download the stop-word lists (per language)
from string import punctuation # the string '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
Create a set of stop words that also includes punctuation characters
# A set gives O(1) membership tests when filtering tokens below.
custom_stop_words = set(stopwords.words('english')+list(punctuation)) # printed: {"'", 'off', 'our', 'having', … 'against', 'just', 'up', 'she', 'after', "that'll", 'y', 'yours', 'o', 't', 'weren', "hasn't", 'is', "mustn't", 'about', '&', '+', '$', 'whom', 'any', 'other'}
# Keep only tokens that are neither stop words nor punctuation.
# NOTE: matching is case-sensitive — 'Do' survives while 'do' would be dropped.
list_without_stop_words = [ word for word in word_tokenize(text) if word not in custom_stop_words] # ['Do', 'Or', 'There', 'try']
Identifying Bigrams
# Bigrams are pairs of adjacent tokens; the finder counts their frequencies.
# Import the class explicitly rather than via `from nltk.collocations import *`.
from nltk.collocations import BigramCollocationFinder
# Association measures (PMI, chi-squared, ...) — not used below, but can rank
# collocations via finder.score_ngrams(bigram_measures.pmi).
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(list_without_stop_words)
# Returns (it does not print) the bigrams and their frequencies, sorted:
sorted(finder.ngram_fd.items()) # [(('Do', 'Or'), 1), (('Or', 'There'), 1), (('There', 'try'), 1)]
Stemming
# Stemming chops words down to a root form. Lancaster is an aggressive stemmer.
# (Original had all three statements jammed onto one line; reformatted here.)
from nltk.stem.lancaster import LancasterStemmer

another_text = "Some words a rooted the same way, because of their roots!"
st = LancasterStemmer()
# Stem every token; note 'Some' -> 'som', 'because' -> 'becaus', 'roots'/'rooted' -> 'root'.
stemmed_words = [ st.stem(word) for word in word_tokenize(another_text) ] # ['som', 'word', 'a', 'root', 'the', 'sam', 'way', ',', 'becaus', 'of', 'their', 'root', '!']
Parts-of-Speech (POS) Tagging
# POS tagging labels each token with its grammatical role (noun, verb, ...).
nltk.download('averaged_perceptron_tagger') # model used by nltk.pos_tag
# Returns (token, Penn Treebank tag) pairs, e.g. VBP = verb, RB = adverb.
nltk.pos_tag(word_tokenize(text))
'''
[('Do', 'NNP'),
('.', '.'),
('Or', 'CC'),
('do', 'VBP'),
('not', 'RB'),
('.', '.'),
('There', 'EX'),
('is', 'VBZ'),
('no', 'DT'),
('try', 'NN'),
('.', '.')]
'''
Word Sense Disambiguation
nltk.download('wordnet')
from nltk.corpus import wordnet as wn # WordNet is a lexical database, something like a thesaurus
# Print every sense (synset) WordNet knows for the ambiguous word "bass".
# (Original lost the loop-body indentation; restored here so it runs.)
for ss in wn.synsets('bass'):
    print(ss, ss.definition())
'''
Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range
'''
# The Lesk algorithm picks the synset whose definition best overlaps the
# context words surrounding the ambiguous word.
from nltk.wsd import lesk
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(sense1, sense1.definition())
# Synset('bass.n.07') the member with the lowest range of a family of musical instruments
sense2 = lesk(word_tokenize("This sea bass was really hard to catch!"), "bass")
print(sense2, sense2.definition())
# Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
# (the original note paired this definition with 'bass.n.07', but that id is the
# instrument sense — the Serranidae definition belongs to 'sea_bass.n.01')

What do you think?