clouderyu 2020-07-17
在掌握基于Weka工具的数据挖掘(分类、回归、聚类、关联规则分析)应用的基础上,实现基于Weka API的数据挖掘程序设计。
1.下载安装JDK 7.0 64位版,Weka 3.7版,Eclipse IDE for Java Developers 4.0以上版本。
2.基于Weka API的数据分类。
3.基于Weka API的数据回归。
4.基于Weka API的数据聚类。
5.基于Weka API的关联规则分析。
(1)JDK与Weka的安装方法与实验1中相同。
(2)从http://www.eclipse.org/home/index.php 下载并安装Eclipse。
(3)在Eclipse中建立一个新的Java工程,用于放置实验程序的源代码。
(4)编程请遵循Java编程规范。规范中文版参见:
http://www.hawstein.com/posts/google-java-style.html。
(a)将数值型字段规范化至[0,1]区间。
(b)调用特征选择算法(Select attributes),选择关键特征。
运行结果如下:
代码如下:
package weka; import java.io.BufferedReader; import java.io.FileReader; import java.util.Random; import weka.attributeSelection.AttributeSelection; import weka.attributeSelection.BestFirst; import weka.attributeSelection.CfsSubsetEval; import weka.classifiers.bayes.NaiveBayes; import weka.classifiers.evaluation.Evaluation; import weka.classifiers.functions.MultilayerPerceptron; import weka.classifiers.trees.J48; import weka.classifiers.trees.RandomForest; import weka.core.Attribute; import weka.core.Instances; import weka.core.Utils; import weka.filters.Filter; import weka.filters.unsupervised.attribute.Discretize; import weka.filters.unsupervised.attribute.Normalize; import weka.filters.unsupervised.attribute.NumericToNominal; import weka.filters.unsupervised.attribute.Remove; /** * 二、基于Weka API的数据分类 * @author 西瓜不甜 * */ public class Test_1 { public static void main(String[] args) throws Exception { /** * 1.读取"电费回收数据.csv" */ Instances data = new Instances(new BufferedReader(new FileReader("C:\\Users\\西瓜不甜\\Desktop\\workspace\\weka\\电费回收数据.arff"))); /** * 2.数据预处理 * (1)删除无用属性 */ String[] options = new String[2]; options[0] = "-R"; // remove options[1] = "1-2,7,10,12"; // "CONS_NO", "YMD", "RCVED_DATE", "CUISHOU_COUNT", "YM" Remove remove = new Remove(); remove.setOptions(options); remove.setInputFormat(data); data = Filter.useFilter(data, remove); // apply filter data.setClassIndex(data.numAttributes() - 1); // set class /** * 2.数据预处理 * (2)归一化 */ Normalize norm = new Normalize(); norm.setInputFormat(data); data = Filter.useFilter(data, norm); /** * 2.数据预处理 * (3)特征选择 */ AttributeSelection attSelect = new AttributeSelection(); CfsSubsetEval eval = new CfsSubsetEval(); BestFirst search = new BestFirst(); attSelect.setEvaluator(eval); attSelect.setSearch(search); attSelect.SelectAttributes(data); int[] indices = attSelect.selectedAttributes(); System.out.print("\n Select Attributes : starting with (0): " + Utils.arrayToString(indices) + ‘\t‘); for (int i = 0; i <indices.length; i++) { int index = indices[i]; Attribute attribute = data.attribute(index); System.out.print(attribute.name() + " "); } System.out.println(); /** * 2.数据预处理 * (4) 离散化 */ Discretize disc = new Discretize(); disc.setInputFormat(data); data = Filter.useFilter(data, disc); /** * 2.数据预处理 * (5) 类型转换 */ NumericToNominal nutono = new NumericToNominal(); nutono.setInputFormat(data); data = Filter.useFilter(data, nutono); /** * 3.数据分类 * (0) 取60%为训练集,40%为测试集 */ data.randomize(new Random(0)); int trainSize = (int) Math.round(data.numInstances() * 0.60); int testSize = data.numInstances() - trainSize; Instances train = new Instances(data, 0, trainSize); Instances test = new Instances(data, trainSize, testSize); /** * 3.数据分类 * (1)决策树(J48) */ J48 j48 = new J48(); // train classifier j48.buildClassifier(train); Evaluation evaluation; evaluation = new Evaluation(train); // evaluate classifier and print some statistics evaluation.evaluateModel(j48, test); System.out.println(evaluation.toSummaryString("\n======J48 : Results======\n", false)); System.out.println(evaluation.toClassDetailsString()); System.out.println(evaluation.toMatrixString()); /** * 3.数据分类 * (2)随机森林(RandomForest) */ RandomForest rf = new RandomForest(); // train classifier rf.buildClassifier(train); evaluation = new Evaluation(train); // evaluate classifier and print some statistics evaluation.evaluateModel(rf, test); System.out.println(evaluation.toSummaryString("\n======RandomForest : Results======\n", false)); System.out.println(evaluation.toClassDetailsString()); System.out.println(evaluation.toMatrixString()); /** * 3.数据分类 * (3)神经网络(MultilayerPerceptron) * ps:运行时间约4h,谨慎运行。 */ MultilayerPerceptron mp = new MultilayerPerceptron(); // train classifier mp.buildClassifier(train); evaluation = new Evaluation(train); // evaluate classifier and print some statistics evaluation.evaluateModel(mp, test); System.out.println(evaluation.toSummaryString("\n======MultilayerPerceptron : Results======\n", false)); System.out.println(evaluation.toClassDetailsString()); System.out.println(evaluation.toMatrixString()); /** * 3.数据分类 * (4)朴素贝叶斯(NaiveBayes) */ NaiveBayes nb = new NaiveBayes(); // train classifier nb.buildClassifier(train); evaluation = new Evaluation(train); // evaluate classifier and print some statistics evaluation.evaluateModel(nb, test); System.out.println(evaluation.toSummaryString("\n======NaiveBayes : Results======\n", false)); System.out.println(evaluation.toClassDetailsString()); System.out.println(evaluation.toMatrixString()); } }
(a)将数值型字段规范化至[0,1]区间。
(b)调用特征选择算法(Select attributes),选择关键特征。
运行结果如下: