Tokenizing
Tokenizing is the process of taking a text and breaking it into words or sentences.
Importing libraries and defining the text

import nltk
from nltk import word_tokenize, sent_tokenize

nltk.download('punkt')  # download the Punkt sentence tokenizer models

text = "Do. Or do not. There is no try."
Break into sentences

sentences = sent_tokenize(text)
# ['Do.', 'Or do not.', 'There is no try.']
Break into words

words = word_tokenize(text)
# ['Do', '.', 'Or', 'do', 'not', '.', 'There', 'is', 'no', 'try', '.']
Use a comprehension to break each sentence into words

words_in_sentences = [word_tokenize(sentence) for sentence in sentences]
# [['Do', '.'], ['Or', 'do', 'not', '.'], ['There', 'is', 'no', 'try', '.']]
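For comparison, plain str.split() keeps punctuation glued to the words, which is exactly what word_tokenize avoids:

text.split()
# ['Do.', 'Or', 'do', 'not.', 'There', 'is', 'no', 'try.']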
Removing Stop Words
from string import punctuation
from nltk.corpus import stopwords

nltk.download('stopwords')  # download the list of stop words
Create a set of stop words that also includes punctuation

custom_stop_words = set(stopwords.words('english') + list(punctuation))
# printed: {"'", 'off', 'our', 'having', … 'against', 'just', 'up', 'she',
#           'after', "that'll", 'y', 'yours', 'o', 't', 'weren', "hasn't",
#           'is', "mustn't", 'about', '&', '+', '$', 'whom', 'any', 'other'}

list_without_stop_words = [word for word in word_tokenize(text)
                           if word not in custom_stop_words]
# ['Do', 'Or', 'There', 'try']
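Note that stopwords.words('english') is all lowercase, which is why the capitalized 'Do' and 'Or' survived the filter above. A minimal sketch that lowercases the text before filtering:

# Lowercasing first catches sentence-initial stop words too
lowered_without_stop_words = [word for word in word_tokenize(text.lower())
                              if word not in custom_stop_words]
# should leave only ['try']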
Identifying Bigrams
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

bigram_measures = BigramAssocMeasures()
finder = BigramCollocationFinder.from_words(list_without_stop_words)

sorted(finder.ngram_fd.items())
# bigrams and their frequencies, sorted:
# [(('Do', 'Or'), 1), (('Or', 'There'), 1), (('There', 'try'), 1)]
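The bigram_measures object is there to score collocations. On this toy input every bigram occurs exactly once, so the ranking is not meaningful, but the pattern looks like this:

# Rank bigrams by an association measure (here, pointwise mutual
# information) and return the top n; expect ties on such a tiny sample.
finder.nbest(bigram_measures.pmi, 2)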
Stemming
from nltk.stem.lancaster import LancasterStemmer

another_text = "Some words a rooted the same way, because of their roots!"

st = LancasterStemmer()
stemmed_words = [st.stem(word) for word in word_tokenize(another_text)]
# ['som', 'word', 'a', 'root', 'the', 'sam', 'way', ',',
#  'becaus', 'of', 'their', 'root', '!']
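Lancaster is the most aggressive stemmer shipped with NLTK; the classic Porter stemmer truncates less. A quick comparison sketch (expect, for example, 'some' and 'same' to survive intact where Lancaster produced 'som' and 'sam'):

from nltk.stem.porter import PorterStemmer

porter = PorterStemmer()
porter_words = [porter.stem(word) for word in word_tokenize(another_text)]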
Parts-of-Speech (POS) Tagging
nltk.download('averaged_perceptron_tagger')  # the default POS tagger model

nltk.pos_tag(word_tokenize(text))
# [('Do', 'NNP'), ('.', '.'), ('Or', 'CC'), ('do', 'VBP'), ('not', 'RB'),
#  ('.', '.'), ('There', 'EX'), ('is', 'VBZ'), ('no', 'DT'), ('try', 'NN'),
#  ('.', '.')]
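The tags come from the Penn Treebank tagset (note the tagger's slip above: the sentence-initial 'Do' gets NNP, a proper noun). NLTK can print a description of any tag once the 'tagsets' resource is downloaded:

nltk.download('tagsets')  # human-readable descriptions of the tags
nltk.help.upenn_tagset('EX')
# prints something like: EX: existential there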
Word Sense Disambiguation
nltk.download('wordnet')
from nltk.corpus import wordnet as wn  # WordNet is a lexicon, something like a thesaurus

for ss in wn.synsets('bass'):
    print(ss, ss.definition())
# Synset('bass.n.01') the lowest part of the musical range
# Synset('bass.n.02') the lowest part in polyphonic music
# Synset('bass.n.03') an adult male singer with the lowest voice
# Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
# Synset('freshwater_bass.n.01') any of various North American freshwater fish with lean flesh (especially of the genus Micropterus)
# Synset('bass.n.06') the lowest adult male singing voice
# Synset('bass.n.07') the member with the lowest range of a family of musical instruments
# Synset('bass.n.08') nontechnical name for any of numerous edible marine and freshwater spiny-finned fishes
# Synset('bass.s.01') having or denoting a low vocal or instrumental range
from nltk.wsd import lesk

sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), "bass")
print(sense1, sense1.definition())
# Synset('bass.n.07') the member with the lowest range of a family of musical instruments

sense2 = lesk(word_tokenize("This sea bass was really hard to catch!"), "bass")
print(sense2, sense2.definition())
# Synset('sea_bass.n.01') the lean flesh of a saltwater fish of the family Serranidae
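lesk also accepts an optional part-of-speech argument that narrows the candidate senses; a small sketch restricting 'bass' to its noun senses:

# pos='n' excludes non-noun senses such as the adjective bass.s.01
sense3 = lesk(word_tokenize("Sing in a lower tone, along with the bass"),
              "bass", pos='n')
print(sense3, sense3.definition())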
What do you think?