第一次参加NLP的比赛，也算是了解一下文本相关

比赛地址：https://god.yanxishe.com/44

之前参与的比赛都是数据类型的比赛，从来没接触过文本类型的任务，对文本的处理还不了解，现在稍微熟悉了一点。比赛给出的数据是微博（weibo）的一段评论数据，标注了每段评论的情感倾向，有正、负、未知三种类别。比赛目标就是对文本进行分类，评价标准是准确率。

文本分类的基本步骤

和一般的分类任务相同，文本比赛的处理流程也是文本向量化（如TF-IDF，词频统计），然后进行特征提取，放入分类模型，参数调优，提升准确率。

数据导入

import jieba
import lightgbm as lgb
import numpy as np
import pandas as pd
import xgboost as xgb
from keras.callbacks import EarlyStopping
from keras.layers import GlobalMaxPooling1D, Conv1D, MaxPooling1D, Flatten, Bidirectional, SpatialDropout1D
from keras.layers.core import Dense, Activation, Dropout
from keras.layers.embeddings import Embedding
from keras.layers.normalization import BatchNormalization
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential
from keras.preprocessing import sequence, text
from keras.utils import np_utils
from nltk import word_tokenize
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from tqdm import tqdm

# Load the tab-separated training and test sets from the working directory.
df_train = pd.read_csv('./train.csv', sep='\t')
df_test = pd.read_csv('./test.csv', sep='\t')

# Show the (rows, columns) shape of each frame, one per line.
print(df_train.shape, df_test.shape, sep='\n')

# Print the column / dtype / non-null summary of the training frame.
df_train.info()

导入数据后，简单查看数据，剔除或补全异常值，同时我们对label进行编码，导入中文停用词表。中文停用词指的是频繁出现在语句中的助词，对于词向量的转换没有意义，因此在分词之后需要剔除。

# Training rows without a stance are assigned the 'NONE' class.
df_train['stance'] = df_train['stance'].fillna('NONE')
df_train = df_train.reset_index(drop=True)

# Number of distinct stance values present in the training data.
label_unique = df_train['stance'].unique()
nb_class = len(label_unique)

# Stance name -> integer class id, plus the inverse mapping that is used to
# turn predicted class ids back into stance names.
label_dict = {'AGAINST': 0, 'FAVOR': 1, 'NONE': 2}
label_dict2 = {class_id: stance for stance, class_id in label_dict.items()}

# Stop-word list: one entry per line of the UTF-8 text file.
with open('./stopwords.txt', encoding='utf8') as f:
    content = f.read()
stopWordList = content.splitlines()
len(stopWordList)

分词

中文分词和英文分词不同：英文基本上按空格切分单词即可，而中文分词需要依靠分词工具。这里选用的是jieba分词，同时把训练集的标签映射为整数编码。

def _segment(sentence):
    """Cut one text with jieba (cut_all=False) and join the tokens with single spaces."""
    return " ".join(jieba.cut(sentence, cut_all=False))


# Integer class id for every training row (None where the stance is not in label_dict).
df_train['label'] = df_train['stance'].apply(label_dict.get)

# Space-separated jieba tokens for the training and the test texts.
df_train['cut_text'] = df_train['text'].apply(_segment)
df_test['cut_text'] = df_test['text'].apply(_segment)

# Plain Python lists of the segmented texts; totle_text is train followed by test.
train_text = df_train['cut_text'].tolist()
test_text = df_test['cut_text'].tolist()
totle_text = train_text + test_text

值得注意的是，分词是任何中文文本分类的起点，分词的质量会直接影响到后面的模型效果。在这里，作为演示有点偷懒，其实你还可以：

设置可靠的自定义词典，以便分词更精准；
采用分词效果更好的分词器，如pyltp、THULAC、Hanlp等；
编写预处理类，就像下面要谈到的数字特征归一化，去掉文本中的#@￥%……&等等。

TF-IDF

TF-IDF（Term Frequency–Inverse Document Frequency，词频-逆文档频率）是一种常用于信息处理和数据挖掘的加权技术。该技术采用一种统计方法，根据字词在文本中出现的次数和在整个语料中出现的文档频率，来计算一个字词在整个语料中的重要程度。它的优点是能过滤掉一些常见却无关紧要的词语，同时保留影响整个文本的重要字词。

简单来说，就是利用词频和文本频率来计算这个词在整个语料库的重要性，将词语转换为数字，进而判断整个句子的类别。

Scikit-Learn中TF-IDF权重计算方法主要用到两个类：CountVectorizer和TfidfTransformer。下面直接使用的TfidfVectorizer相当于把两者合并：先用CountVectorizer统计词频，再用TfidfTransformer计算TF-IDF权重。

# TF-IDF over unigrams and bigrams with the stop words removed and max_df=0.5.
# The vocabulary and IDF weights are fitted on the train and test texts together.
vectorizer = TfidfVectorizer(
    stop_words=stopWordList,
    max_df=0.5,
    ngram_range=(1, 2),
).fit(totle_text)

# Integer class ids for the training rows.
df_train['label'] = df_train['stance'].map(label_dict)
y = df_train['label'].values

# Sparse TF-IDF matrices for the training and the test texts.
X, X_test = (vectorizer.transform(docs) for docs in (train_text, test_text))

# Raw term counts; the token pattern also keeps single-character tokens.
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}').fit(totle_text)

# Count matrices for the training texts and the test texts
# (the second one is named "valid" but is built from test_text).
xtrain_count = count_vect.transform(train_text)
xvalid_count = count_vect.transform(test_text)

模型

朴素贝叶斯

from scipy import optimize

# Stratified 5-fold splitter, shuffled with a fixed seed.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# oof_train holds, for every training row, the class probabilities predicted by
# the fold model that did not see that row; oof_test is the test-set prediction
# averaged over the fold models.
oof_train = np.zeros((len(df_train), nb_class))
oof_test = np.zeros((len(df_test), nb_class))

for idx, (tr_in, te_in) in enumerate(skf.split(X, y)):
    X_train, y_train = X[tr_in], y[tr_in]
    X_valid, y_valid = X[te_in], y[te_in]

    clf = MultinomialNB(alpha=.25)
    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_valid)
    oof_train[te_in] = y_pred
    oof_test = oof_test + clf.predict_proba(X_test) / skf.n_splits

# Per-class weight search (the method recommended by "Bao Daren"): look for
# three multipliers on the class probabilities that maximise OOF accuracy.
x1 = np.array(oof_train)
y1 = np.array(y)


def fun(x):
    """Return the negated accuracy of the OOF predictions after scaling class
    columns 0, 1 and 2 of x1 by x[0], x[1] and x[2] respectively."""
    weighted = np.column_stack([x[k] * x1[:, k] for k in range(3)])
    return -accuracy_score(y1, np.argmax(weighted, axis=1))


x0 = np.asarray((0, 0, 0))
res = optimize.fmin_powell(fun, x0)

# OOF accuracy before and after applying the class weights.
xx_score = accuracy_score(y, np.argmax(oof_train, axis=1))
print('原始score', xx_score)

xx_cv = accuracy_score(y, np.argmax(oof_train * res, axis=1))
print('修正后的', xx_cv)

# Submission: weighted argmax of the averaged test probabilities, mapped back to
# stance names. The file name carries the digits after the decimal point of xx_cv.
result = df_test[['text']].copy()
result['stance'] = np.argmax(oof_test * res, axis=1)
result['label'] = result['stance'].map(label_dict2)
result[['label']].to_csv(
    './baseline_NB_{}.csv'.format(str(xx_cv).split('.')[1]),
    header=None,
)

SVM

from scipy import optimize

# Same stratified, shuffled 5-fold scheme and seed as the Naive Bayes section.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold probabilities for the training rows and fold-averaged
# probabilities for the test rows.
oof_train = np.zeros((len(df_train), nb_class))
oof_test = np.zeros((len(df_test), nb_class))

for idx, (tr_in, te_in) in enumerate(skf.split(X, y)):
    X_train = X[tr_in]
    y_train = y[tr_in]
    X_valid = X[te_in]
    y_valid = y[te_in]

    # probability=True because class probabilities are needed below.
    clf = SVC(C=1.0, probability=True)
    clf.fit(X_train, y_train)

    y_pred = clf.predict_proba(X_valid)
    oof_train[te_in] = y_pred
    oof_test = oof_test + clf.predict_proba(X_test) / skf.n_splits

# Per-class weight search (the method recommended by "Bao Daren").
x1 = np.array(oof_train)
y1 = np.array(y)


def fun(x):
    """Return minus the OOF accuracy obtained when class columns 0-2 of x1 are
    multiplied by x[0], x[1], x[2] before taking the argmax."""
    weighted = np.stack(
        (x[0] * x1[:, 0], x[1] * x1[:, 1], x[2] * x1[:, 2]),
        axis=1,
    )
    return -accuracy_score(y1, np.argmax(weighted, axis=1))


x0 = np.asarray((0, 0, 0))
res = optimize.fmin_powell(fun, x0)

# OOF accuracy without and with the class weights.
xx_score = accuracy_score(y, np.argmax(oof_train, axis=1))
print('原始score', xx_score)

xx_cv = accuracy_score(y, np.argmax(oof_train * res, axis=1))
print('修正后的', xx_cv)

# Refit one SVM on the full training matrix and predict the test set with it.
# NOTE(review): the submission below uses these unweighted predictions; neither
# `oof_test` nor `res` is applied, although `xx_cv` (the weighted OOF score)
# ends up in the file name. Confirm this is intended.
clf = SVC(C=1.0, probability=True)
clf.fit(X, y)
predictions = clf.predict_proba(X_test)

result = df_test[['text']].copy()
result['stance'] = np.argmax(predictions, axis=-1)
result['label'] = result['stance'].map(label_dict2)
result[['label']].to_csv(
    './SVM_{}.csv'.format(str(xx_cv).split('.')[1]),
    header=None,
)

LGB

# Number of stance classes; shared by the booster parameters and the
# prediction buffers below.
num_class = 3

# LightGBM parameters for multi-class training, evaluated with the
# multi-class error rate. (The original had `param` and `= {` on separate
# lines, which is a syntax error.)
param = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'num_class': num_class,
    'metric': 'multi_error',
    'num_leaves': 300,
#     'min_data_in_leaf': 500,
    'learning_rate': 0.01,
    'feature_fraction': 0.8,
    # NOTE(review): per the LightGBM docs bagging is only active when
    # bagging_freq is non-zero, so this value currently has no effect.
    'bagging_fraction': 0.8,
#     'bagging_freq': 5,
    'lambda_l1': 0.4,
    'lambda_l2': 0.5,
#     'min_gain_to_split': 0.2,
    'verbose': -1,
#     'num_threads':4,
}

X_train = X
y_train = y

# 5-fold cross-validation. The original passed random_state together with
# shuffle=False, which current scikit-learn rejects with a ValueError (older
# releases ignored the seed). Shuffling is enabled so that the seed takes
# effect, in line with the shuffled, seeded splitters of the other models.
folds = KFold(n_splits=5, shuffle=True, random_state=2019)
# Buffer for out-of-fold predictions; it is only filled if the line marked
# below is re-enabled.
oof = np.zeros([X_train.shape[0], num_class])
# Test-set class probabilities averaged over the fold models.
predictions = np.zeros([X_test.shape[0], num_class])

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_train, y_train)):
    print("fold n°{}".format(fold_+1))
    trn_data = lgb.Dataset(X_train[trn_idx], y_train[trn_idx])
    val_data = lgb.Dataset(X_train[val_idx], y_train[val_idx])

    # Up to 1000 boosting rounds, logging every 100 rounds and stopping early
    # after 100 rounds without improvement on the validation data.
    num_round = 1000
    clf = lgb.train(param,
                    trn_data,
                    num_round,
                    valid_sets = [trn_data, val_data],
                    verbose_eval = 100,
                    early_stopping_rounds = 100)
    # Re-enable to keep out-of-fold predictions:
    #oof[val_idx] = clf.predict(X_train[val_idx], num_iteration=clf.best_iteration)
    predictions += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
print(predictions)

# Most probable class id per test row, mapped back to its stance name.
df_test['label'] = predictions.argmax(axis=-1)
df_test['stance'] = df_test['label'].map(label_dict2)

# Write the predicted stance names (index included, no header row).
df_result = df_test.loc[:, ['stance']]
df_result.to_csv('./lightgbm_.csv',header=None)

数据处理

使用奇异值分解（SVD）来减少TF-IDF的特征数量，同时将数据标准化。

# `xtrain_tfv` / `xtest_tfv` were used here without ever being assigned. The
# TF-IDF matrices built earlier in this file are X (train) and X_test (test),
# so expose them under the names this section uses.
xtrain_tfv, xtest_tfv = X, X_test

# Project the TF-IDF features onto 150 SVD components; the decomposition is
# fitted on the training matrix only.
svd = decomposition.TruncatedSVD(n_components=150)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xtest_svd = svd.transform(xtest_tfv)

# Standardise the SVD components with statistics taken from the training set.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xtest_svd_scl = scl.transform(xtest_svd)

# SVM on the reduced, standardised features (probability=True because class
# probabilities are needed).
clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_svd_scl, y)
predictions = clf.predict_proba(xtest_svd_scl)

# The undefined `accuracy` helper is replaced by sklearn's accuracy_score.
# NOTE(review): `ans_final` is not defined anywhere in the visible file; it is
# presumably a DataFrame holding ground-truth integer labels for the test set
# in a 'label' column. It must be loaded before this line can run - confirm.
print ("accuracy: %0.6f " % accuracy_score(np.array(ans_final['label']), predictions.argmax(axis=-1)))

# Same SVM on the full TF-IDF features, for comparison with the SVD variant.
clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_tfv, y)
predictions = clf.predict_proba(xtest_tfv)

print ("accuracy: %0.6f " % accuracy_score(np.array(ans_final['label']), predictions.argmax(axis=-1)))

词嵌入Word2Vec

将高维稀疏的词向量转移到低维空间，相比较于TF-IDF，效果可能更好。

import gensim

# Dimensionality of the word vectors; shared by training and by the zero
# vectors returned for texts without any known token.
W2V_DIM = 200

# Segmented texts as space-joined strings (train first, then test).
train_text = list(df_train['cut_text'].values)
test_text = list(df_test['cut_text'].values)
totle_text = train_text + test_text

# gensim expects an iterable of sentences, each one a list of tokens. The
# original flattened every token into one list with extend(), so each token
# string was itself treated as a sentence and the model learned vectors for
# single characters. Keep one token list per document instead.
all_text = [doc.split() for doc in totle_text]

# Skip-gram model (sg=1) using both hierarchical softmax and negative sampling.
# NOTE(review): `size`, `iter`, `wv.index2word` and `wv.syn0` are gensim 3.x
# names (4.x: `vector_size`, `epochs`, `index_to_key`, `vectors`) - confirm the
# installed version before upgrading.
model = gensim.models.Word2Vec(all_text, size=W2V_DIM, iter=10, sg=1, window=5,
                               min_count=5, negative=3, sample=0.001, hs=1, workers=4)

# token -> vector lookup table
embeddings_index = dict(zip(model.wv.index2word, model.wv.syn0))

print('Found %s word vectors.' % len(embeddings_index))


def sent2vec(s):
    """Return the L2-normalised sum of the vectors of the known tokens in `s`.

    `s` is converted to a lower-cased string and tokenised with nltk's
    word_tokenize; tokens missing from `embeddings_index` are skipped. A zero
    vector of length W2V_DIM is returned when no token is known or when the
    summed vector has zero norm (the original divided by zero in that case).
    """
    # NOTE(review): word_tokenize and lower() differ from the whitespace
    # tokenisation used to train the model - confirm before using this helper.
    words = word_tokenize(str(s).lower())
    # Explicit membership test instead of a bare `except`, which hid any error.
    vectors = [embeddings_index[w] for w in words if w in embeddings_index]
    if not vectors:
        return np.zeros(W2V_DIM)
    v = np.sum(vectors, axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        return np.zeros(W2V_DIM)
    return v / norm


def get_contentVector(cutWords, word2vec_model):
    """Return the mean word vector of the in-vocabulary tokens of one text.

    cutWords: either an iterable of tokens or a whitespace-joined string of
        tokens (the form stored in the 'cut_text' column). Strings are split
        on whitespace; iterating them directly would look up single characters.
    word2vec_model: trained gensim Word2Vec model; lookups go through its `wv`.

    A zero vector is returned when none of the tokens is in the vocabulary;
    the original took the mean of an empty array there, which yields NaN.
    """
    if isinstance(cutWords, str):
        cutWords = cutWords.split()
    keyed_vectors = word2vec_model.wv
    vector_list = [keyed_vectors[k] for k in cutWords if k in keyed_vectors]
    if not vector_list:
        return np.zeros(keyed_vectors.vector_size)
    return np.array(vector_list).mean(axis=0)


# One averaged vector per training text and per test text, stacked into
# 2-D arrays (the second one is named "valid" but is built from test_text).
xtrain_w2v = np.array([get_contentVector(x, model) for x in tqdm(train_text)])
xvalid_w2v = np.array([get_contentVector(x, model) for x in tqdm(test_text)])

这一步之后，继续使用模型进行分类。

# Three classifiers trained on the averaged Word2Vec features. The undefined
# `accuracy` helper is replaced by sklearn's accuracy_score.
# NOTE(review): `ans_final` is not defined anywhere in the visible file; it is
# presumably a DataFrame holding ground-truth integer labels for the test set
# in a 'label' column. It must be loaded before the prints can run - confirm.

# LightGBM
clf = lgb.LGBMClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_w2v, y)
predictions = clf.predict_proba(xvalid_w2v)

print ("accuracy: %0.6f " % accuracy_score(np.array(ans_final['label']), predictions.argmax(axis=-1)))

# Logistic regression
clf = LogisticRegression(C=1.0,solver='lbfgs',multi_class='multinomial')
clf.fit(xtrain_w2v, y)
predictions = clf.predict_proba(xvalid_w2v)

print ("accuracy: %0.6f " % accuracy_score(np.array(ans_final['label']), predictions.argmax(axis=-1)))

# SVM (probability=True because class probabilities are needed)
clf = SVC(C=1.0, probability=True)
clf.fit(xtrain_w2v, y)
predictions = clf.predict_proba(xvalid_w2v)

print ("accuracy: %0.6f " % accuracy_score(np.array(ans_final['label']), predictions.argmax(axis=-1)))