【NLP】初学自然语言处理

  跟着Bag of Words Meets Bags of Popcorn的初学者实例,敲了一遍代码。主要用到的是CountVectorizer,生成每个评论的词频向量,然后利用随机森林建立模型,对新的评论进行预测。提交之后,分数大概为0.84。

  

import pandas as pd
import re
from bs4 import BeautifulSoup
import logging

logging.basicConfig(level=logging.ERROR)

# Load the labeled training data (tab-separated; quoting=3 turns off quote
# handling so embedded double quotes inside reviews are kept verbatim).
train = pd.read_csv('/Users/meitu/Downloads/labeledTrainData.tsv', header=0,
                    delimiter="\t", quoting=3)
print(train['sentiment'].head(10))

# Exploratory walk-through of the cleaning pipeline on the first review.
# Explicit "html.parser" keeps the result independent of which optional
# parsers are installed and silences bs4's GuessedAtParserWarning.
example1 = BeautifulSoup(train['review'][0], "html.parser")

# BUG FIX: the original pattern was "[^a-zA-z]" (lower-case final z).  The
# ASCII range A-z also matches the punctuation characters [ \ ] ^ _ ` and
# therefore failed to replace them.  Keep ASCII letters only.
letters_only = re.sub('[^a-zA-Z]', " ", example1.get_text())
print(letters_only)
lower_case = letters_only.lower()
words = lower_case.split()
print(words)

from nltk.corpus import stopwords

# Drop English stop words ("the", "a", ...) which carry little sentiment signal.
words = [w for w in words if w not in stopwords.words('english')]
print(words)


def review_to_words(raw_review):
    """Clean one raw review into a single space-separated string of words.

    Pipeline: strip HTML, drop non-letters, lower-case, split, remove
    English stop words, re-join with spaces.

    :param raw_review: one review as a raw (possibly HTML-laden) string
    :return: cleaned review as a space-separated string of lower-case words
    """
    # 1. Remove HTML.  The explicit "html.parser" makes the result
    #    independent of which optional parsers (lxml, html5lib) are
    #    installed and avoids bs4's GuessedAtParserWarning.
    review_text = BeautifulSoup(raw_review, "html.parser").get_text()

    # 2. Remove non-letters: anything outside a-z / A-Z becomes a space.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)

    # 3. Convert to lower case and split into individual words.
    words = letters_only.lower().split()

    # 4. Membership tests against a set are O(1) vs O(n) for a list.
    stops = set(stopwords.words('english'))

    # 5. Remove stop words.
    meaningful_words = [w for w in words if w not in stops]

    # 6. Join the words back into one string separated by spaces.
    return " ".join(meaningful_words)


if __name__ == '__main__':
    # ---- Clean the training reviews -------------------------------------
    num_reviews = train['review'].size
    clean_train_reviews = []
    for i in range(num_reviews):
        clean_train_reviews.append(review_to_words(train['review'][i]))
        if (i + 1) % 1000 == 0:
            # BUG FIX: the original passed the values as extra print()
            # arguments ("review %d of %d\n", i + 1, num_reviews) instead
            # of %-formatting them into the string.
            print("review %d of %d\n" % (i + 1, num_reviews))

    print("Creating the bag of words...\n")
    from sklearn.feature_extraction.text import CountVectorizer

    # Bag-of-words model: word-level tokens, keep only the 5000 most
    # frequent terms across the training corpus.
    vectorizer = CountVectorizer(analyzer="word",
                                 tokenizer=None,
                                 stop_words=None,
                                 max_features=5000)

    # Learn the vocabulary and build the (n_reviews x 5000) count matrix.
    train_data_features = vectorizer.fit_transform(clean_train_reviews)
    train_data_features = train_data_features.toarray()
    print(train_data_features)

    # get_feature_names() was removed in scikit-learn 1.2; prefer the
    # replacement and fall back only on older versions.
    try:
        vocab = vectorizer.get_feature_names_out()
    except AttributeError:
        vocab = vectorizer.get_feature_names()
    print(vocab)

    # ---- Train the classifier -------------------------------------------
    print("Training the random forest...")
    from sklearn.ensemble import RandomForestClassifier

    forest = RandomForestClassifier(n_estimators=100)
    forest = forest.fit(train_data_features, train['sentiment'])

    # ---- Clean and score the test set -----------------------------------
    test = pd.read_csv('/Users/meitu/Downloads/testData.tsv', header=0,
                       delimiter="\t", quoting=3)
    print(test.shape)

    num_reviews = len(test['review'])
    clean_test_reviews = []
    for i in range(num_reviews):
        if (i + 1) % 1000 == 0:
            print("Review %d of %d\n" % (i + 1, num_reviews))
        clean_test_reviews.append(review_to_words(test['review'][i]))

    # Transform (not fit_transform): reuse the vocabulary learned on the
    # training set so test features line up column-for-column.
    test_data_features = vectorizer.transform(clean_test_reviews).toarray()

    result = forest.predict(test_data_features)

    # Kaggle submission file: review id + predicted sentiment.
    output = pd.DataFrame(data={'id': test['id'], 'sentiment': result})
    output.to_csv('bag_of_word_model.csv', index=False, quoting=3)

 

posted @ 2017-10-24 09:57  Fall12  阅读(363)  评论(0编辑  收藏  举报