基本步骤:1、训练素材分类:我参考的是官方的目录结构——每个类别一个目录,目录中放对应的文本,一个 txt 文件对应一篇文章,就像下面这样。需要注意的是:各类别的素材数量请尽量保持相近(可根据训练结果酌情调整),比例不可过于悬殊,否则容易造成过拟合(通俗点说,就是大部分文章都会被分到素材最多的那个类别去)。废话不多说,直接上代码吧(测试代码写得比较丑,将就着看看吧)。需要一个小工具:pip install chinese-tokenizer 这是训练器:
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
# Training script: builds a Multinomial Naive Bayes text classifier from the
# documents under ./data and persists everything the prediction script needs:
#   - training_data.target : JSON list of category names
#   - model.pkl            : the fitted classifier
#   - count_vect           : the fitted CountVectorizer (its vocabulary fixes
#                            the feature dimension the classifier expects)
import re
import json
from io import BytesIO

import jieba
from chinese_tokenizer.tokenizer import Tokenizer
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

# `sklearn.externals.joblib` is only a legacy alias and is missing from newer
# scikit-learn releases; prefer the standalone package and fall back to the
# alias on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

jie_ba_tokenizer = Tokenizer().jie_ba_tokenizer

# Load the corpus; the label of each document comes from its directory.
training_data = load_files('./data', encoding='utf-8')

# Only the training part of the split is used; the held-out part is discarded.
x_train, _, y_train, _ = train_test_split(training_data.data, training_data.target)

print('开始建模.....')

# Persist the category names so predicted indices can be mapped back to labels.
with open('training_data.target', 'w', encoding='utf-8') as f:
    f.write(json.dumps(training_data.target_names))

# Bag-of-words counts using the Chinese tokenizer, then TF-IDF weighting.
# (The original passed the undefined name `jieba_tokenizer` here, which raised
# NameError; the tokenizer bound above is `jie_ba_tokenizer`.)
count_vect = CountVectorizer(tokenizer=jie_ba_tokenizer)
tfidf_transformer = TfidfTransformer()
X_train_counts = count_vect.fit_transform(x_train)
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

print('正在训练分类器.....')
clf = MultinomialNB().fit(X_train_tfidf, y_train)

# Save the classifier together with the fitted vectorizer.
joblib.dump(clf, 'model.pkl')
joblib.dump(count_vect, 'count_vect')

print("分类器的相关信息:")
print(clf)
下面是使用训练好的分类器对文章进行分类:需要分类的文章放在 predict_data 目录中,同样是一篇文章一个 txt 文件。
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
# -*- coding: utf-8 -*-
# @Time : 2017/8/23 18:02
# @Author : 哎哟卧槽
# @Site :
# @File : 贝叶斯分类器.py
# @Software: PyCharm
#
# Prediction script: loads the classifier and the fitted CountVectorizer that
# the training script saved, vectorizes every document under ./predict_data,
# and prints the predicted category for each file.
import re
import json

import jieba
from sklearn.datasets import load_files
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# `sklearn.externals.joblib` is only a legacy alias and is missing from newer
# scikit-learn releases; prefer the standalone package and fall back to the
# alias on old installations.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Load the classifier and the vectorizer fitted at training time. Reusing the
# saved vectorizer keeps the feature dimension identical to what the
# classifier was trained on.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files that
# were produced by a trusted training run.
clf = joblib.load('model.pkl')
count_vect = joblib.load('count_vect')

testing_data = load_files('./predict_data', encoding='utf-8')

# Category names written by the training script; the context manager closes
# the file handle, which the original one-liner left open.
with open('training_data.target', 'r', encoding='utf-8') as target_file:
    target_names = json.loads(target_file.read())

# Text features: counts from the saved vocabulary, then TF-IDF weighting.
# NOTE(review): this TfidfTransformer is fitted on the prediction data, so its
# IDF weights differ from the ones used during training. The training script
# does not persist its transformer, so this behavior is kept as-is; consider
# saving and reusing the fitted transformer.
tfidf_transformer = TfidfTransformer()
X_new_counts = count_vect.transform(testing_data.data)
X_new_tfidf = tfidf_transformer.fit_transform(X_new_counts)

# Run prediction and report "<file path> => <category name>" for each document.
predicted = clf.predict(X_new_tfidf)
for title, category in zip(testing_data.filenames, predicted):
    print('%r => %s' % (title, target_names[category]))
这样,在新的程序中使用训练好的分类器时,就不会再报错:ValueError: dimension mismatch(因为预测时复用了训练时保存的 count_vect,特征维度与训练时保持一致)。这儿有个 demo 仅供参考:GitHub 地址