Selier 2020-04-11
1、准备数据
0 1:59 2:2 3:43.4 4:2 5:1
0 1:36 2:1 3:57.2 4:1 5:1
0 1:61 2:2 3:190 4:2 5:1
1 1:58 2:3 3:128 4:4 5:3
1 1:55 2:3 3:80 4:3 5:4
0 1:61 2:1 3:94.4 4:2
0 1:38 2:1 3:76 4:1 5:1
0 1:42 2:1 3:240 4:3 5:2
0 1:50 2:1 3:74 4:1 5:1
0 1:58 2:3 3:68.6 4:2 5:2
0 1:68 2:3 3:132.8 4:4 5:2
1 1:25 2:2 3:94.6 4:4 5:3
0 1:52 2:1 3:56 4:1 5:1
0 1:31 2:1 3:47.8 4:2 5:1
1 1:36 2:3 3:31.6 4:3 5:1
0 1:42 2:1 3:66.2 4:2 5:1
1 1:14 2:3 3:138.6 4:3 5:3
0 1:32 2:1 3:114 4:2 5:3
0 1:35 2:1 3:40.2 4:2 5:1
1 1:70 2:3 3:177.2 4:4 5:3
1 1:65 2:2 3:51.6 4:4 5:4
0 1:45 2:2 3:124 4:2 5:4
1 1:68 2:3 3:127.2 4:3 5:3
0 1:31 2:2 3:124.8 4:2 5:3
2、Python算法
# Logistic-regression baseline with scikit-learn.
#
# Loads the LIBSVM-format samples, holds out a third of them for testing,
# scales the features, fits a LogisticRegression classifier and prints the
# confusion matrix and the classification report for the held-out samples.
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Read the file: element 0 is the feature matrix, element 1 the label vector.
data = datasets.load_svmlight_file("../../wa.txt")
x = data[0]
y = data[1]

# Split the data: 33% is held out for testing; random_state pins the shuffle
# so the split is reproducible between runs.
X_train, X_test, Y_train, Y_test = train_test_split(
    x, y, test_size=0.33, random_state=0
)

# Standardise the data. Only the scale is adjusted (with_mean=False) and the
# matrices are modified in place (copy=False).
# NOTE(review): load_svmlight_file is documented to return a sparse matrix,
# which StandardScaler cannot centre -- presumably why with_mean is disabled.
sc = StandardScaler(copy=False, with_mean=False, with_std=True)
X_train = sc.fit_transform(X_train)  # learn the scale on the training set only
X_test = sc.transform(X_test)        # reuse the training-set scale

# Train the model.
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)

# Evaluate on the held-out data.
Y_pred = classifier.predict(X_test)
cm = confusion_matrix(Y_test, Y_pred)
print(cm)  # print confusion_matrix
print(classification_report(Y_test, Y_pred))  # print classification report
3、Spark算法
package com.sunbin

import org.apache.spark.ml.classification.LogisticRegression
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.log4j.{ Level, Logger }

/**
 * Classifies gastric-cancer metastasis with logistic regression.
 *
 * Reads LIBSVM-format samples from `wa.txt`, trains a binary
 * `LogisticRegressionWithLBFGS` model on a random subset and prints one
 * `(prediction, label)` pair for every held-out sample.
 */
object LogisticRegressionTest {

  /**
   * Program entry point.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local").setAppName("logistic")
    val spark = SparkSession.builder().config(conf).getOrCreate()

    try {
      /*
       * A LabeledPoint is a local vector, dense or sparse, paired with a
       * label (response). MLlib uses labeled points in its supervised
       * learning algorithms. Because the label is stored as a Double, the
       * type serves both regression and classification. For binary
       * classification a positive sample is labeled 1 and a negative sample
       * 0; for multiclass problems the labels should be class indices
       * starting from zero: 0, 1, 2, ...
       */
      val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(spark.sparkContext, "wa.txt")

      // The weights are only a target: the resulting training and test sets
      // are not necessarily split exactly 8:2. The seed makes the split
      // repeatable.
      val splitData = data.randomSplit(Array(0.8, 0.2), 2L)
      val training = splitData(0).cache()
      val test = splitData(1)

      // Build the LogisticRegressionWithLBFGS trainer, set the number of
      // classes to 2 (binary), and run it on the training set to obtain the
      // fitted model.
      val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(training)

      // Score the test set with the trained model, pairing each prediction
      // with the true label.
      val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
        val prediction = model.predict(features)
        (prediction, label)
      }

      // Bring the pairs back to the driver before printing; RDD.foreach runs
      // on the executors, whose stdout is not the driver's outside local mode.
      predictionAndLabels.collect().foreach(println)
    } finally {
      // Release the Spark resources even when loading or training fails.
      spark.stop()
    }
  }
}
theta = np.zeros #theta = array,构造全为零的行向量。grad[0,j] = np.sum/len #∑term / m. return value > threshol