放上代码例子:
# -*-coding:utf-8 -*-
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import time, datetime
import statsmodels.api as sm
# Load the student table (read as GBK). The path is a raw string so that the
# Windows backslashes stay literal instead of relying on unrecognised escape
# sequences such as "\w", which trigger an invalid-escape warning.
df = pd.read_csv(
    r'D:\work\数据分析\换师次数学生状态关系\全部学员课消状态表.csv',
    encoding='gbk',
)

# Values of the '累计课消' column for every row.
data = list(df['累计课消'])

# Rows split by the value of the '学生状态' column (0..4).
data0 = df[df['学生状态'] == 0]
data1 = df[df['学生状态'] == 1]
data2 = df[df['学生状态'] == 2]
data3 = df[df['学生状态'] == 3]
data4 = df[df['学生状态'] == 4]

# The '累计课消' values of each status subset, as plain lists.
data_0 = list(data0['累计课消'])
data_1 = list(data1['累计课消'])
data_2 = list(data2['累计课消'])
data_3 = list(data3['累计课消'])
data_4 = list(data4['累计课消'])

# Upper bin edges 3, 6, ..., 252 (84 edges, step 3).
list_bin = list(np.arange(3, 255, 3))
def count_y(object, bins=None):
    """Count how many values fall into each bin defined by upper edges.

    Bin ``j`` holds the values ``v`` with ``edges[j-1] <= v < edges[j]``; the
    first bin starts 3 below ``edges[0]``.  Values greater than or equal to
    the last edge are added to the last bin.  Values below the first bin's
    lower bound, and NaN, are not counted.

    Args:
        object: Iterable of numbers to bin.  (The name shadows the builtin
            ``object``; it is kept so existing callers are unaffected.)
        bins: Ascending upper bin edges.  Defaults to the module-level
            ``list_bin`` when omitted.

    Returns:
        A 1-D integer ``numpy.ndarray`` with one count per edge.
    """
    edges = np.asarray(list_bin if bins is None else bins)
    # Width of the first bin; matches the step of 3 used to build list_bin.
    bin_width = 3

    values = np.asarray(object, dtype=float)
    # The comparison is False for NaN, so NaN is dropped along with values
    # that lie below the first bin.
    values = values[values >= edges[0] - bin_width]

    # side='right' yields the index of the first edge strictly greater than
    # the value, i.e. the half-open bin [lower, upper) it belongs to.
    slots = np.searchsorted(edges, values, side='right')
    # Anything at or beyond the last edge overflows into the last bin.
    slots = np.minimum(slots, len(edges) - 1)

    return np.bincount(slots, minlength=len(edges))
# Per-bin counts over all rows of the '累计课消' column.
y = count_y(data)
# NOTE(review): `y_5` is not defined anywhere in the visible code, so this line
# raises NameError as written. Presumably it is count_y(...) of one status
# subset -- confirm which one before running.
# NOTE(review): bins where `y` is 0 make this ratio inf/NaN -- confirm whether
# such bins should be dropped before fitting.
refund=pd.Series(y_5/y)
# Regressor: the bin upper edges from list_bin.
class_fire=pd.Series(list_bin)
# Add an intercept (constant) column to the regressor.
X = sm.add_constant(class_fire)
# Ordinary least squares fit of `refund` on the constant plus `class_fire`.
est=sm.OLS(refund,X).fit()
# The summary object is only displayed automatically in an interactive
# session/notebook; in a plain script the result of this expression is discarded.
est.summary()
输出:
左边:(模型描述)
Dep.Variable: 输出变量的名称
Model :模型名称
Method: 方法 其中 Least Squares 表示最小二乘法
Date: 日期
Time: 时间
No.Observations: 样本数目
Df Residuals: 残差自由度(观测数 No. Observations −(参数数目 Df Model + 1 个常数项))
–残差代表的是实际观察值与估计值的差
Df Model: 模型参数个数(不含常数项),相当于输入的 X 的变量(列)个数
右边:(模型质量描述)(需要参考判断的数据)
R-squared: 可决系数,用来判断估计的准确性,范围在 [0,1],越接近 1,说明对 y 的解释能力越强,拟合越好