【Natural Language Processing】DeepLearning.AI-1-w1-Logistic Regression

Learn to extract features from text into numerical vectors, then build a binary classifier for tweets using logistic regression!

Typical text preprocessing steps

  • Preprocessing
    • Remove handles and URLs
    • Tokenize the text
    • Remove stop words, e.g. in English: and, is, a, on, etc.
    • Reduce words to their stems (stemming), e.g. dancer, dancing, danced all become 'danc'
    • Lowercase all uppercase letters
```python
# example
import nltk    # Python library for NLP
import re      # library for regular expression operations
import string  # for string operations

from nltk.corpus import stopwords         # module for stop words that come with NLTK
from nltk.stem import PorterStemmer       # module for stemming
from nltk.tokenize import word_tokenize   # module for tokenizing strings

tweet_text = 'My beautiful sunflowers on a sunny Friday morning off :) #sunflowers #favourites #happy #Friday off… https://t.co/3tfYom0N1i'

def preprocess(text, remove_tag=True, tokenize=True, remove_stop_p=True, stem=True, lower=True):
    """
    Preprocess a piece of text: strip retweet tags, URLs, and hash signs,
    then tokenize, remove stopwords/punctuation, stem, and lowercase.
    """
    print(text)
    if remove_tag:
        # remove old style retweet text "RT"
        text = re.sub(r'^RT[\s]+', '', text)
        # remove hyperlinks
        text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
        # remove hashtags: only removing the hash # sign from the word
        text = re.sub(r'#', '', text)
        print('\033[92m' + text)

    text_list = []
    if tokenize:
        nltk.download('punkt')
        text_list = word_tokenize(text)
        print('\033[94m' + str(text_list))

    text_clean = []
    if remove_stop_p:
        nltk.download('stopwords')
        stopwords_english = stopwords.words('english')
        for word in text_list:                        # go through every word in the token list
            if (word not in stopwords_english and     # remove stopwords
                    word not in string.punctuation):  # remove punctuation
                text_clean.append(word)
        print('\033[92m' + str(text_clean))

    text_stem = []
    if stem:
        stemmer = PorterStemmer()           # instantiate the stemming class
        for word in text_clean:
            stem_word = stemmer.stem(word)  # stem the word
            text_stem.append(stem_word)     # append to the list
        print('\033[94m' + str(text_stem))

    text_lower = []
    if lower:
        for word in text_stem:
            text_lower.append(word.lower())
        print('\033[92m' + str(text_lower))
    return text_lower

preprocess(tweet_text)
```

Two ways to turn a sentence into a feature vector

Method 1 - Word presence

  • Denote the vocabulary of all texts to be processed as \(V\)
    • For each sentence, mark the words that appear as 1 and the rest as 0
    • Each sentence is therefore a vector of dimension \(|V|\)
  • Problem
    • Such sparse, high-dimensional vectors make training and prediction slow
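A minimal sketch of this presence encoding (the vocabulary and sentence below are made-up illustrations, not from the course):

```python
def presence_vector(sentence, vocabulary):
    """Map a sentence to a |V|-dimensional 0/1 vector:
    1 if the vocabulary word appears in the sentence, else 0."""
    tokens = set(sentence.lower().split())
    return [1 if word in tokens else 0 for word in vocabulary]

vocab = ['i', 'am', 'happy', 'sad', 'because', 'learning', 'nlp']
vec = presence_vector('I am happy because I am learning NLP', vocab)
print(vec)  # [1, 1, 1, 0, 1, 1, 1]
```

With a realistic vocabulary \(|V|\) is in the tens of thousands, so almost every entry is 0, which is exactly the sparsity problem noted above.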

Method 2 - Word frequencies

  • Suppose a corpus has four sentences: two positive and two negative

  • Count, for each unique word, its frequency in the positive class and in the negative class, building a frequency table

  • Represent each sentence with a 3-dimensional vector
    • The first dimension is the bias term
    • The second dimension is the sum of the positive frequencies of the sentence's words found in the table
    • The third dimension is the sum of the negative frequencies of the sentence's words found in the table
    • The example sentence from the course is then represented as \((1, 8, 11)\)
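The frequency-table construction and the 3-dimensional feature extraction can be sketched as follows (the tiny corpus here is a made-up stand-in for the course's example, so the resulting numbers differ from \((1,8,11)\)):

```python
from collections import defaultdict

# Tiny labeled corpus: (sentence, class) with 1 = positive, 0 = negative.
corpus = [
    ('i am happy because i am learning nlp', 1),
    ('i am happy', 1),
    ('i am sad i am not learning nlp', 0),
    ('i am sad', 0),
]

# Frequency table: (word, class) -> count across the corpus.
freqs = defaultdict(int)
for sentence, label in corpus:
    for word in sentence.split():
        freqs[(word, label)] += 1

def extract_features(sentence, freqs):
    """Map a sentence to (bias, sum of positive freqs, sum of negative freqs),
    summing over the sentence's unique words."""
    words = set(sentence.split())
    pos = sum(freqs[(w, 1)] for w in words)
    neg = sum(freqs[(w, 0)] for w in words)
    return (1, pos, neg)

print(extract_features('i am happy because i am learning', freqs))  # (1, 10, 7)
```

Every sentence is compressed to just 3 numbers regardless of vocabulary size, which is what makes this representation much cheaper to train on than the \(|V|\)-dimensional presence vectors of Method 1.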