Logistic Regression Algorithm

Selier 2020-04-11

1. Prepare the data

0 1:59 2:2 3:43.4 4:2 5:1
0 1:36 2:1 3:57.2 4:1 5:1
0 1:61 2:2 3:190 4:2 5:1
1 1:58 2:3 3:128 4:4 5:3
1 1:55 2:3 3:80 4:3 5:4
0 1:61 2:1 3:94.4 4:2
0 1:38 2:1 3:76 4:1 5:1
0 1:42 2:1 3:240 4:3 5:2
0 1:50 2:1 3:74 4:1 5:1
0 1:58 2:3 3:68.6 4:2 5:2
0 1:68 2:3 3:132.8 4:4 5:2
1 1:25 2:2 3:94.6 4:4 5:3
0 1:52 2:1 3:56 4:1 5:1
0 1:31 2:1 3:47.8 4:2 5:1
1 1:36 2:3 3:31.6 4:3 5:1
0 1:42 2:1 3:66.2 4:2 5:1
1 1:14 2:3 3:138.6 4:3 5:3
0 1:32 2:1 3:114 4:2 5:3
0 1:35 2:1 3:40.2 4:2 5:1
1 1:70 2:3 3:177.2 4:4 5:3
1 1:65 2:2 3:51.6 4:4 5:4
0 1:45 2:2 3:124 4:2 5:4
1 1:68 2:3 3:127.2 4:3 5:3
0 1:31 2:2 3:124.8 4:2 5:3
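Each row above is one sample in libSVM (sparse) format: the first token is the class label (0 or 1), followed by index:value pairs for the five features; an index that is missing from a row (as in the row with only four pairs) is simply treated as zero by the loaders used below. The minimal sketch below is purely illustrative and not part of the original scripts; it decodes one line by hand to show how it maps to a label and a feature vector.

# Minimal sketch (not part of the original pipeline): decode one libSVM-formatted line.
line = "0 1:59 2:2 3:43.4 4:2 5:1"
tokens = line.split()
label = float(tokens[0])                      # class label: 0 or 1
features = {int(idx): float(val)              # feature index -> feature value
            for idx, val in (t.split(":") for t in tokens[1:])}
print(label)     # 0.0
print(features)  # {1: 59.0, 2: 2.0, 3: 43.4, 4: 2.0, 5: 1.0}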

2. Python implementation (scikit-learn)

from sklearn import datasets

# Load the libSVM-formatted file: a sparse feature matrix and a label vector
data = datasets.load_svmlight_file("../../wa.txt")
x = data[0]
y = data[1]
# Split into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(x, y, test_size=0.33, random_state=0)
# Standardize the features (with_mean=False because the feature matrix is sparse)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler(copy=False, with_mean=False, with_std=True)
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
# Train the logistic regression model
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()
classifier.fit(X_train, Y_train)
# Predict on the test set
Y_pred = classifier.predict(X_test)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
cm = confusion_matrix(Y_test, Y_pred)
print(cm)  # print confusion_matrix
print(classification_report(Y_test, Y_pred))   # print classification report
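As a small optional extension of the script above (it assumes the variables classifier, X_test, Y_test and Y_pred from that script are still in scope), the overall accuracy and the predicted class probabilities can be inspected as well:

# Optional extension (assumes classifier, X_test, Y_test, Y_pred from the script above)
from sklearn.metrics import accuracy_score

print(accuracy_score(Y_test, Y_pred))        # fraction of correctly classified test samples
print(classifier.predict_proba(X_test)[:5])  # P(class 0) and P(class 1) for the first 5 test samples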

3. Spark implementation (MLlib)

package com.sunbin

import org.apache.spark.SparkConf
import org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.mllib.util.MLUtils
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.log4j.{Level, Logger}


/**
  * Classify gastric cancer metastasis with logistic regression.
  */
object LogisticRegressionTest {
  def main(args: Array[String]): Unit = {
    Logger.getRootLogger.setLevel(Level.WARN)
    val conf = new SparkConf().setMaster("local").setAppName("logistic")
    val spark = SparkSession.builder().config(conf).getOrCreate()

    /**
      * A LabeledPoint is a local vector, dense or sparse, paired with a label (response).
      * In MLlib, labeled points are used by the supervised learning algorithms. Because the
      * label is stored as a double, LabeledPoint works for both regression and classification.
      * For binary classification the positive class is labeled 1 and the negative class 0;
      * for multiclass classification the labels are zero-based indices: 0, 1, 2, ...
      */
    val data: RDD[LabeledPoint] = MLUtils.loadLibSVMFile(spark.sparkContext, "wa.txt")
    // Split randomly into training and test sets; the 80/20 ratio is only approximate
    val splitData = data.randomSplit(Array(0.8, 0.2), 2L)
    val training = splitData(0).cache()
    val test = splitData(1)
    // Build a LogisticRegressionWithLBFGS model with 2 classes and train it on the training set
    val model = new LogisticRegressionWithLBFGS().setNumClasses(2).run(training)
    // Score the test set with the trained model, pairing each prediction with its true label
    val predictionAndLabels = test.map { case LabeledPoint(label, features) =>
      val prediction = model.predict(features)
      (prediction, label)
    }
    predictionAndLabels.foreach(println)

    spark.stop()
  }
}
