Table of Contents
Tokenizing
Tokenizing is the process of taking a text and breaking it into words or sentences.
Importing libraries and defining the text
# Import NLTK plus the word- and sentence-level tokenizer helpers used below.
import nltk
from nltk import word_tokenize, sent_tokenize
nltk.download('punkt') # download the Punkt sentence-tokenizer models required by sent_tokenize/word_tokenize
# Sample text used throughout the tokenizing examples.
text = "Do. Or do not. There is no try."
Break into sentences
# Split the text on sentence boundaries (Punkt handles abbreviations, etc.).
sentences = sent_tokenize(text) # ['Do.', 'Or do not.', 'There is no try.']
Break into words
# Split the text into word tokens; note punctuation comes out as its own token.
words = word_tokenize(text) # ['Do', '.', 'Or', 'do', 'not', '.', 'There', 'is', 'no', 'try', '.']
Use a comprehension to break each sentence into words
# Tokenize each sentence separately, yielding one list of tokens per sentence.
words_in_sentences = [ word_tokenize( sentence ) for sentence in sentences ] # [['Do', '.'], ['Or', 'do', 'not', '.'], ['There', 'is', 'no', 'try', '.']]
Removing Stop Words
# Stop words are high-frequency words (the, is, at, ...) usually filtered out.
from nltk.corpus import stopwords
nltk.download('stopwords') # will download the stop-word lists (per language)
from string import punctuation # the string '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
Create a set of stop words that also includes punctuation characters
# A set gives O(1) membership tests when filtering tokens below.
custom_stop_words = set(stopwords.words('english')+list(punctuation)) # printed: {"'", 'off', 'our', 'having', … 'against', 'just', 'up', 'she', 'after', "that'll", 'y', 'yours', 'o', 't', 'weren', "hasn't", 'is', "mustn't", 'about', '&', '+', '$', 'whom', 'any', 'other'}
# Keep only tokens that are neither stop words nor punctuation.
# NOTE: matching is case-sensitive — 'Do' survives while 'do' would be dropped.
list_without_stop_words = [ word for word in word_tokenize(text) if word not in custom_stop_words] # ['Do', 'Or', 'There', 'try']
Identifying Bigrams
# Bigrams are pairs of adjacent tokens; the finder counts their frequencies.
# Import the class explicitly rather than via `from nltk.collocations import *`.
from nltk.collocations import BigramCollocationFinder
# Association measures (PMI, chi-squared, ...) — not used below, but can rank
# collocations via finder.score_ngrams(bigram_measures.pmi).
bigram_measures = nltk.collocations.BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(list_without_stop_words)
# Returns (it does not print) the bigrams and their frequencies, sorted:
sorted(finder.ngram_fd.items()) # [(('Do', 'Or'), 1), (('Or', 'There'), 1), (('There', 'try'), 1)]
Stemming
# Stemming chops words down to a root form. Lancaster is an aggressive stemmer.
# (Original had all three statements jammed onto one line; reformatted here.)
from nltk.stem.lancaster import LancasterStemmer

another_text = "Some words a rooted the same way, because of their roots!"
st = LancasterStemmer()
# Stem every token; note 'Some' -> 'som', 'because' -> 'becaus', 'roots'/'rooted' -> 'root'.
stemmed_words = [ st.stem(word) for word in word_tokenize(another_text) ] # ['som', 'word', 'a', 'root', 'the', 'sam', 'way', ',', 'becaus', 'of', 'their', 'root', '!']
Parts-of-Speech (POS) Tagging
# POS tagging labels each token with its grammatical role (noun, verb, ...).
nltk.download('averaged_perceptron_tagger') # model used by nltk.pos_tag
# Returns (token, Penn Treebank tag) pairs, e.g. VBP = verb, RB = adverb.
nltk.pos_tag(word_tokenize(text))
'''
[('Do', 'NNP'),
('.', '.'),
('Or', 'CC'),
('do', 'VBP'),
('not', 'RB'),
('.', '.'),
('There', 'EX'),
('is', 'VBZ'),
('no', 'DT'),
('try', 'NN'),
('.', '.')]
'''
Word Sense Disambiguation
nltk.download('wordnet')
from nltk.corpus import wordnet as wn # WordNet is a lexical database, something like a thesaurus
# Print every sense (synset) WordNet knows for the ambiguous word "bass".
# (Original lost the loop-body indentation; restored here so it runs.)
for ss in wn.synsets('bass'):
    print(ss, ss.definition())
'''
Synset('bass.n.01') the lowest part of the musical range
Synset('bass.n.02') the lowest part in polyphonic music
Synset('bass.n.03') an adult male singer with the lowest voice
Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
Synset('bass.n.06') the lowest adult male singing voice
Synset('bass.n.07') the member with the lowest range of a family of musical instruments
Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
Synset('bass.s.01') having or denoting a low vocal or instrumental range
'''
# The Lesk algorithm picks the synset whose definition best overlaps the
# context words surrounding the ambiguous word.
from nltk.wsd import lesk
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(sense1, sense1.definition())
# Synset('bass.n.07') the member with the lowest range of a family of musical instruments
sense2 = lesk(word_tokenize("This sea bass was really hard to catch!"), "bass")
print(sense2, sense2.definition())
# Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
# (the original note paired this definition with 'bass.n.07', but that id is the
# instrument sense — the Serranidae definition belongs to 'sea_bass.n.01')

What do you think?