[特征工程]常用功能实现

前言：特征工程是机器学习流程中的重要步骤，是数据预处理阶段的主要内容。

本文汇总了特征工程最常用功能的具体实现方式，方便快速查询使用。(我不会编写代码，我只是代码的复制粘贴工)

1.特征归一化处理：零均值归一化、线性函数归一化、二值化

 1 import numpy as np
 2 import pandas as pd
 3 import matplotlib as mpl
 4 import sklearn
 5 
 6 from sklearn import preprocessing
 7 
 8 X_train = np.array([[1.,-1.,2.],[2.,0.,0.],[0.,1.,-1.]])
 9 
10 #1.使用scale，进行“零均值归一化”
11 X_scaled = preprocessing.scale(X_train)
12 
13 #2.使用StandardScaler，进行“零均值归一化”
14 X_scaled = preprocessing.StandardScaler().fit_transform(X_train)
15 
16 #3.使用MinMaxScaler，进行“线性函数归一化，0～+1区间”
17 X_scaled = preprocessing.MinMaxScaler(feature_range=(0,1)).fit_transform(X_train)
18 
19 #4.对数值特征进行“0-1二值化”，大于threshold的都变成1，小于等于threshold的都变成0
20 X_scaled = preprocessing.Binarizer(threshold=0.0).fit_transform(X_train)

2.类别型特征处理：序号编码、独热编码、标签二值化编码

 1 import numpy as np
 2 import pandas as pd
 3 import matplotlib as mpl
 4 import sklearn
 5 
 6 from sklearn import preprocessing
 7 
 8 X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
 9 
10 #1.使用OrdinalEncoder进行“序号编码”
11 enc = preprocessing.OrdinalEncoder().fit(X)
12 Y = [['female', 'from US', 'uses Safari']]
13 result = enc.transform(Y)
14 
15 #2.使用OneHotEncoder进行“独热编码”
16 enc = preprocessing.OneHotEncoder().fit(X)
17 Y = [['female', 'from US', 'uses Safari'],['male', 'from Europe', 'uses Safari']]
18 result = enc.transform(Y)
19 
20 #3.LabelBinarizer
21 enc = preprocessing.LabelBinarizer().fit(np.array([[0, 1, 1], [1, 0, 0]]))
22 Y = [0, 1, 2, 1]
23 result = enc.transform(Y)

3.高维组合特征处理：矩阵特征值分解、矩阵奇异值分解

 1 import numpy as np
 2 import pandas as pd
 3 import matplotlib as mpl
 4 import sklearn
 5 
 6 from sklearn.decomposition import TruncatedSVD
 7 from sklearn.random_projection import sparse_random_matrix
 8 
 9 ##1. numpy实现矩阵分解——特征值分解
10 from numpy.linalg import eig
11 #生成随机值矩阵 (4,4)大小
12 A = np.random.randint(-10,10,(4,4))
13 #矩阵与其转置相乘，得到对称正定矩阵
14 C = np.dot(A.T, A)
15 #矩阵分解
16 vals, vecs = eig(C)
17 ##特征值
18 print('矩阵分解的特征值为：')
19 print(vals)
20 ##特征向量
21 #print(vecs)
22 print()
23 
24 ##2. numpy实现矩阵分解——奇异值分解
25 from numpy.linalg import svd
26 B = np.random.randint(-10,10,(4, 3)).astype(float)
27 #SVD分解
28 U,Sigma,V_trans = np.linalg.svd(B)
29 #print(U)
30 #奇异值组成的对角矩阵
31 print('奇异值分解的奇异值为：')
32 print(Sigma)
33 #print(V_trans)
34 print()
35 
36 ##3. sk-learn中的TruncatedSVD
37 ##生成一个稀疏矩阵
38 X = sparse_random_matrix(100, 100, density=0.01, random_state=42)
39 ##<class 'scipy.sparse.csr.csr_matrix'>
40 #print(type(X))
41 ##<class 'numpy.ndarray'>
42 #print(type(X.A))
43 ##存储到文件中
44 #np.savetxt('X.txt',X.A)
45 ##对于稀疏矩阵，使用TruncatedSVD进行降维，提取出低维的有效特征
46 ##n_components:期望的输出数据维数。必须严格小于功能的数量。默认值为2，对于可视化非常有用。对于LSA，建议值为100。
47 svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
48 svd.fit(X)
49 #print(svd.explained_variance_)
50 #print(svd.explained_variance_ratio_)
51 ##奇异值
52 print('使用learn中的TruncatedSVD得到最大的n个的奇异值：')
53 print(svd.singular_values_)
54 print()

4.特征选择：基于L1正则化的特征选择，基于树的特征选择

 1 import numpy as np
 2 import pandas as pd
 3 import matplotlib as mpl
 4 import sklearn
 5 
 6 ##1. 使用L1正则化进行特征选择
 7 from sklearn.svm import LinearSVC
 8 from sklearn.datasets import load_iris
 9 from sklearn.feature_selection import SelectFromModel
10 iris = load_iris()
11 X, y = iris.data, iris.target
12 print(X.shape)
13 
14 lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(X, y)
15 model = SelectFromModel(lsvc, prefit=True)
16 X_new = model.transform(X)
17 print(X_new.shape)
18 print()
19 
20 
21 ##2. 使用基于树的特征选择
22 from sklearn.ensemble import ExtraTreesClassifier
23 from sklearn.datasets import load_iris
24 from sklearn.feature_selection import SelectFromModel
25 iris = load_iris()
26 X, y = iris.data, iris.target
27 print(X.shape)
28 
29 clf = ExtraTreesClassifier(n_estimators=50)
30 clf = clf.fit(X, y)
31 print(clf.feature_importances_)
32 
33 model = SelectFromModel(clf, prefit=True)
34 X_new = model.transform(X)
35 print(X_new.shape)

5.词袋模型(BOW)：词频模型，TF-IDF模型，hash编码模型

 1 import numpy as np
 2 import pandas as pd
 3 import matplotlib as mpl
 4 import sklearn
 5 
 6 ##1. 词袋模型——词频
 7 from sklearn.feature_extraction.text import CountVectorizer
 8 text = ["The quick brown fox jumped over the lazy dog."]
 9 # 创建transform
10 vectorizer = CountVectorizer()
11 ## 使用 N-gram模型拆分
12 #vectorizer = CountVectorizer(ngram_range=(2, 2), decode_error="ignore",token_pattern = r'w+',min_df=1)
13 # 分词并建立词汇表
14 vectorizer.fit(text)
15 # 结果输出
16 print(vectorizer.vocabulary_)
17 vector = vectorizer.transform(text)
18 # 输出编码后的向量信息
19 print(vector.shape)
20 print(type(vector))
21 print(vector.toarray())
22 print()
23 
24 ##2. 词袋模型——词频-逆文档频率模型
25 from sklearn.feature_extraction.text import TfidfVectorizer
26 text = ["The quick brown fox jumped over the lazy dog.","The dog.","The fox"]
27 vectorizer = TfidfVectorizer()
28 vectorizer.fit(text)
29 print(vectorizer.vocabulary_)
30 #逆文档频率：
31 print(vectorizer.idf_)
32 vector = vectorizer.transform([text[0]])
33 print(vector.shape)
34 #文档0的tf-idf得分(0～1之间):
35 print(vector.toarray())
36 print()
37 
38 ##3. 词袋模型——hash向量编码：
39 from sklearn.feature_extraction.text import HashingVectorizer
40 # 文件列表
41 text = ["The quick brown fox jumped over the lazy dog."]
42 # 创建transform
43 vectorizer = HashingVectorizer(n_features=20)
44 # 编码文件
45 vector = vectorizer.transform(text)
46 # 输出编码后的结果
47 print(vector.shape)
48 print(vector.toarray())

6.Word2Vec词嵌入：gensim版

 1 import numpy as np
 2 import pandas as pd
 3 import matplotlib as mpl
 4 import gensim
 5 
 6 texts = [['human', 'interface', 'computer'],
 7 ['survey', 'user', 'computer', 'system', 'response', 'time'],
 8 ['eps', 'user', 'interface', 'system'],
 9 ['system', 'human', 'system', 'eps'],
10 ['user', 'response', 'time'],
11 ['trees'],
12 ['graph', 'trees'],
13 ['graph', 'minors', 'trees'],
14 ['graph', 'minors', 'survey']]
15 
16 # dictionary = gensim.corpora.Dictionary(texts)
17 # corpus = [dictionary.doc2bow(text) for text in texts]
18 # print(corpus[0]) # [(0, 1), (1, 1), (2, 1)]
19 # tfidf = models.TfidfModel(corpus)
20 
21 # #参数说明：
22 # (1) sentences: 我们要分析的语料，可以是一个列表，或者从文件中遍历读出。对于大语料集，建议使用BrownCorpus,Text8Corpus或lineSentence构建。
23 # (2) size: 词向量的维度，默认值是100。这个维度的取值一般与我们的语料的大小相关，视语料库的大小而定。
24 # (3) alpha： 是初始的学习速率，在训练过程中会线性地递减到min_alpha。
25 # (4) window：即词向量上下文最大距离，skip-gram和cbow算法是基于滑动窗口来做预测。默认值为5。在实际使用中，可以根据实际的需求来动态调整这个window的大小。对于一般的语料这个值推荐在[5,10]之间。
26 # (5) min_count:：可以对字典做截断. 词频少于min_count次数的单词会被丢弃掉, 默认值为5。
27 # (6) max_vocab_size: 设置词向量构建期间的RAM限制，设置成None则没有限制。
28 # (7) sample: 高频词汇的随机降采样的配置阈值，默认为1e-3，范围是(0,1e-5)。
29 # (8) seed：用于随机数发生器。与初始化词向量有关。
30 # (9) workers：用于控制训练的并行数。
31 # (10) min_alpha: 由于算法支持在迭代的过程中逐渐减小步长，min_alpha给出了最小的迭代步长值。随机梯度下降中每    轮的迭代步长可以由iter，alpha， min_alpha一起得出。对于大语料，需要对alpha, min_alpha,iter一起调参，来选                        择合适的三个值。
32 # (11) sg: 即我们的word2vec两个模型的选择了。如果是0， 则是CBOW模型，是1则是Skip-Gram模型，默认是0即CBOW模型。
33 # (12)hs: 即我们的word2vec两个解法的选择了，如果是0， 则是Negative Sampling，是1的话并且负采样个数negative大于0， 则是Hierarchical Softmax。默认是0即Negative Sampling。
34 # (13) negative:如果大于零，则会采用negativesampling，用于设置多少个noise words（一般是5-20）。
35 # (14) cbow_mean: 仅用于CBOW在做投影的时候，为0，则采用上下文的词向量之和，为1则为上下文的词向量的平均值。默认值也是1,不推荐修改默认值。
36 # (15) hashfxn： hash函数来初始化权重，默认使用python的hash函数。
37 # (16) iter: 随机梯度下降法中迭代的最大次数，默认是5。对于大语料，可以增大这个值。
38 # (17) trim_rule： 用于设置词汇表的整理规则，指定那些单词要留下，哪些要被删除。可以设置为None（min_count会被使用）。
39 # (18) sorted_vocab： 如果为1（默认），则在分配word index 的时候会先对单词基于频率降序排序。
40 # (19) batch_words：每一批的传递给线程的单词的数量，默认为10000。
41 
42 model = gensim.models.word2vec.Word2Vec(texts, size=5, hs=1, min_count=1, window=3)
43 
44 #1. 计算两个词向量的相似度
45 sim1 = model.similarity(u'human', u'trees')
46 sim2 = model.similarity(u'human', u'computer')
47 print(sim1,sim2)
48 print()
49 
50 #2. 与某个词（human）最相近的5个词
51 for key in model.similar_by_word(u'human', topn=5):
52     print(key)
53 print()
54 
55 #3. 计算某个词(human)的相关列表
56 sim3 = model.most_similar(u'human', topn=5)
57 for key in sim3:
58     print(key)
59 print()
60 
61 #4. 找出与众不同的词
62 sim4 = model.doesnt_match(['human','interface','time'])
63 print(u'不同匹配的词是', sim4)