Notes on Data Science and Artificial Intelligence: V. Text Preprocessing

V. Text Preprocessing

Author: Chris Albon

Translator: Flying dragon

License: CC BY-NC-SA 4.0

Bag of words

# Load libraries
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Create text
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Create the bag-of-words feature matrix
count = CountVectorizer()
bag_of_words = count.fit_transform(text_data)

# Show feature matrix
bag_of_words.toarray()

'''
array([[0, 0, 0, 2, 0, 0, 1, 0],
       [0, 1, 0, 0, 0, 1, 0, 1],
       [1, 0, 1, 0, 1, 0, 0, 0]], dtype=int64) 
'''

# Get feature names (in scikit-learn >= 1.0 use get_feature_names_out() instead)
feature_names = count.get_feature_names()

# View feature names
feature_names

# ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden'] 

# Create data frame
pd.DataFrame(bag_of_words.toarray(), columns=feature_names)
'''
   beats  best  both  brazil  germany  is  love  sweden
0      0     0     0       2        0   0     1       0
1      0     1     0       0        0   1     0       1
2      1     0     1       0        1   0     0       0
'''
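
CountVectorizer takes several options that matter in practice, such as ngram_range to count word n-grams instead of single words and stop_words to drop its built-in English stop word list. A minimal sketch on the same text_data (the exact features kept depend on the corpus):

# Create a feature matrix of 1-grams and 2-grams, dropping English stop words
count_2gram = CountVectorizer(ngram_range=(1, 2), stop_words='english')
bag_of_2grams = count_2gram.fit_transform(text_data)

# View the n-gram feature names (in scikit-learn >= 1.0 use get_feature_names_out())
count_2gram.get_feature_names()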

Parsing HTML

# Load library
from bs4 import BeautifulSoup

# Create some HTML code
html = "<div class='full_name'><span style='font-weight:bold'>Masego</span> Azra</div>"

# Parse the HTML
soup = BeautifulSoup(html, "lxml")

# Find the div with class "full_name" and show its text
soup.find("div", {"class": "full_name"}).text

# 'Masego Azra' 
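
Beyond a single lookup, the same soup object supports nested tags and attribute access; a small follow-on sketch:

# Find the bolded span inside the div and show its text
soup.find("span").text
# 'Masego'

# Read the div's class attribute (BeautifulSoup returns classes as a list)
soup.find("div")["class"]
# ['full_name']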

Remove punctuation

# Load library
import string

# Create text
text_data = ['Hi!!!! I. Love. This. Song....', 
             '10000% Agree!!!! #LoveIT', 
             'Right?!?!']

# Define a function that removes all punctuation using string.punctuation
def remove_punctuation(sentence: str) -> str:
    return sentence.translate(str.maketrans('', '', string.punctuation))

# Apply the function
[remove_punctuation(sentence) for sentence in text_data]

# ['Hi I Love This Song', '10000 Agree LoveIT', 'Right'] 
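
Note that string.punctuation only covers ASCII, so curly quotes and other Unicode punctuation marks would survive. A sketch of a Unicode-aware variant that builds a translation table from every punctuation code point:

# Load libraries
import sys
import unicodedata

# Map every Unicode punctuation code point to None
punctuation = dict.fromkeys(
    i for i in range(sys.maxunicode)
    if unicodedata.category(chr(i)).startswith('P'))

# Strip punctuation, including non-ASCII marks
[sentence.translate(punctuation) for sentence in text_data]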

Remove stop words

# Load library
from nltk.corpus import stopwords

# Download the stop words collection (only needed the first time)
import nltk
nltk.download('stopwords')

'''
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chrisalbon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!

True 
'''

# Create word tokens
tokenized_words = ['i', 'am', 'going', 'to', 'go', 'to', 'the', 'store', 'and', 'park']

# Load stop words
stop_words = stopwords.words('english')

# Show stop words
stop_words[:5]

# ['i', 'me', 'my', 'myself', 'we'] 

# Remove stop words
[word for word in tokenized_words if word not in stop_words]

# ['going', 'go', 'store', 'park'] 
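
NLTK's stop words are all lowercase, so the membership test is case-sensitive; tokens are usually lowercased first, and converting the list to a set speeds up filtering on long texts. A minimal sketch:

# Lowercase tokens, then filter against a set of stop words
stop_word_set = set(stop_words)
[word.lower() for word in tokenized_words if word.lower() not in stop_word_set]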

Replace characters

# Import library
import re

# Create text
text_data = ['Interrobang. By Aishwarya Henriette',
             'Parking And Going. By Karl Gautier',
             'Today Is The night. By Jarek Prakash']

# Remove periods
remove_periods = [text.replace('.', '') for text in text_data]

# Display text
remove_periods

'''
['Interrobang By Aishwarya Henriette',
 'Parking And Going By Karl Gautier',
 'Today Is The night By Jarek Prakash'] 
'''

# Create a function that replaces every letter with X
def replace_letters_with_X(text: str) -> str:
    return re.sub(r'[a-zA-Z]', 'X', text)

# Apply the function
[replace_letters_with_X(text) for text in remove_periods]

'''
['XXXXXXXXXXX XX XXXXXXXXX XXXXXXXXX',
 'XXXXXXX XXX XXXXX XX XXXX XXXXXXX',
 'XXXXX XX XXX XXXXX XX XXXXX XXXXXXX'] 
'''

Stemming

# Load library
from nltk.stem.porter import PorterStemmer

# Create word tokens
tokenized_words = ['i', 'am', 'humbled', 'by', 'this', 'traditional', 'meeting']

Stemming reduces a word to its stem by identifying and removing affixes (for example, gerund endings) while keeping the word's basic meaning. NLTK's PorterStemmer implements the widely used Porter stemming algorithm.

# Create stemmer
porter = PorterStemmer()

# Apply the stemmer
[porter.stem(word) for word in tokenized_words]

# ['i', 'am', 'humbl', 'by', 'thi', 'tradit', 'meet'] 
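
When full dictionary forms are preferred over truncated stems, lemmatization is the usual alternative; NLTK ships a lemmatizer built on WordNet. A minimal sketch (the wordnet corpus must be downloaded once; pos='v' treats each token as a verb):

# Load library
from nltk.stem import WordNetLemmatizer

# First-time download: nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Lemmatize tokens as verbs
[lemmatizer.lemmatize(word, pos='v') for word in ['running', 'ran', 'meeting']]

# ['run', 'run', 'meet']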

Strip whitespace

# Create text
text_data = ['   Interrobang. By Aishwarya Henriette     ',
             'Parking And Going. By Karl Gautier',
             '    Today Is The night. By Jarek Prakash   ']

# Strip leading and trailing whitespace
strip_whitespace = [text.strip() for text in text_data]

# Display text
strip_whitespace

'''
['Interrobang. By Aishwarya Henriette',
 'Parking And Going. By Karl Gautier',
 'Today Is The night. By Jarek Prakash'] 
'''
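
strip only touches the ends of each string; to also collapse runs of internal whitespace, a regex substitution is a common approach. A minimal sketch:

# Load library
import re

# Collapse each run of whitespace to a single space, then strip the ends
[re.sub(r'\s+', ' ', text).strip() for text in text_data]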

Part of speech tagging

# Load libraries
from nltk import pos_tag
from nltk import word_tokenize

# First-time downloads: nltk.download('punkt') and
# nltk.download('averaged_perceptron_tagger')

# Create text
text_data = "Chris loved outdoor running"

# Tag each word with a pre-trained part-of-speech tagger
text_tagged = pos_tag(word_tokenize(text_data))

# Display part of speech
text_tagged

# [('Chris', 'NNP'), ('loved', 'VBD'), ('outdoor', 'RP'), ('running', 'VBG')] 

The output is a list of tuples, each containing a word and its part-of-speech tag. NLTK uses the Penn Treebank tag set; the most common tags are listed below, and a filtering example follows the table.

Label  Part of speech
NNP    Proper noun, singular
NN     Noun, singular or mass
RB     Adverb
VBD    Verb, past tense
VBG    Verb, gerund or present participle
JJ     Adjective
PRP    Personal pronoun
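
The tags make it easy to keep only certain parts of speech; for example, selecting the nouns from the tagged text above:

# Keep only words tagged as some kind of noun
[word for word, tag in text_tagged if tag in ('NN', 'NNS', 'NNP', 'NNPS')]

# ['Chris']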

TF-IDF

# Load libraries
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Create text
text_data = np.array(['I love Brazil. Brazil!',
                      'Sweden is best',
                      'Germany beats both'])

# Create the tf-idf feature matrix
tfidf = TfidfVectorizer()
feature_matrix = tfidf.fit_transform(text_data)

# Show the tf-idf feature matrix
feature_matrix.toarray()

'''
array([[ 0.        ,  0.        ,  0.        ,  0.89442719,  0.        ,
         0.        ,  0.4472136 ,  0.        ],
       [ 0.        ,  0.57735027,  0.        ,  0.        ,  0.        ,
         0.57735027,  0.        ,  0.57735027],
       [ 0.57735027,  0.        ,  0.57735027,  0.        ,  0.57735027,
         0.        ,  0.        ,  0.        ]]) 
'''

# Show feature names (in scikit-learn >= 1.0 use get_feature_names_out() instead)
tfidf.get_feature_names()

# ['beats', 'best', 'both', 'brazil', 'germany', 'is', 'love', 'sweden'] 

# Create data frame
pd.DataFrame(feature_matrix.toarray(), columns=tfidf.get_feature_names())
'''
      beats     best     both    brazil  germany       is      love   sweden
0   0.00000  0.00000  0.00000  0.894427  0.00000  0.00000  0.447214  0.00000
1   0.00000  0.57735  0.00000  0.000000  0.00000  0.57735  0.000000  0.57735
2   0.57735  0.00000  0.57735  0.000000  0.57735  0.00000  0.000000  0.00000
'''
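
tf-idf vectors are most often compared with cosine similarity, which scikit-learn provides directly; a minimal sketch on the feature matrix built above:

# Load library
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the three documents
cosine_similarity(feature_matrix)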

Tokenize text

# Load library
from nltk.tokenize import word_tokenize, sent_tokenize

# First-time download: nltk.download('punkt')

# Create text
text = "The science of today is the technology of tomorrow. Tomorrow is today."

# Tokenize into words
word_tokenize(text)

'''
['The',
 'science',
 'of',
 'today',
 'is',
 'the',
 'technology',
 'of',
 'tomorrow',
 '.',
 'Tomorrow',
 'is',
 'today',
 '.'] 
'''

# Tokenize into sentences
sent_tokenize(text)

# ['The science of today is the technology of tomorrow.', 'Tomorrow is today.'] 
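
The two tokenizers compose naturally when word tokens are needed per sentence; a minimal sketch:

# Tokenize each sentence into words
[word_tokenize(sentence) for sentence in sent_tokenize(text)]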
