
Interpolation and Curve Fitting for Mathematical Modeling

Interpolation and fitting both extract a general rule from a set of data points, and linear regression is the most intuitive embodiment of the fitting idea. The two are not the same, however. Interpolation finds a curve that passes through every data point and derives the equation of that curve, whereas fitting does not require the curve to pass through all the points: using a method such as least squares, it finds the curve that minimizes the sum of squared residuals to the data and approximates it as a whole. The two therefore have slightly different emphases (a minimal sketch contrasting them follows the library list below). This post is a summary of implementing interpolation and fitting in Python.

For interpolation and fitting in Python, the following third-party libraries cover the various needs, from interpolation to multivariate and polynomial fitting:

  • numpy's built-in fitting, which uses least squares and handles polynomial fits
  • scipy.optimize, for fitting user-defined functions
  • sklearn, for single-variable and multivariate fitting
  • scipy.interpolate, for multi-dimensional interpolation
  • statsmodels, for single-variable and multivariate fitting
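
To make the distinction concrete, here is a minimal sketch (with made-up data) that both fits and interpolates the same five points: the least-squares line leaves nonzero residuals, while the cubic interpolant reproduces every point exactly.

import numpy as np
from scipy import interpolate

x = np.array([0.0, 1.0, 2.0, 3.0, 4.0])
y = np.array([0.1, 0.9, 2.2, 2.8, 4.1])

# Fitting: a least-squares line does not pass through the points,
# it only minimizes the sum of squared residuals.
line = np.poly1d(np.polyfit(x, y, 1))
print(line(x) - y)    # residuals are generally nonzero

# Interpolation: a cubic spline passes through every data point exactly.
spline = interpolate.interp1d(x, y, kind='cubic')
print(spline(x) - y)  # residuals are (numerically) zero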

Polynomial fitting with numpy

import numpy as np

X = [1, 2, 3, 4, 5, 6]
Y = [2.5, 3.51, 4.45, 5.52, 6.47, 7.51]
z1 = np.polyfit(X, Y, 1)  # degree-1 polynomial fit, i.e. linear fitting
p1 = np.poly1d(z1)        # wrap the coefficients in a callable polynomial
print(z1)  # [1. 1.49333333]
print(p1)  # 1 x + 1.493
[1.         1.49333333]

1 x + 1.493
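
The poly1d object is callable, so the fitted line can be evaluated directly at new points; a short sketch continuing from p1 above:

print(p1([7, 8]))   # evaluate the fitted line at x = 7 and x = 8
print(p1.deriv())   # derivative polynomial, here just the constant slope
print(p1.roots)     # x where p1(x) == 0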
import numpy
from matplotlib import pyplot as plt


def polyfit(x, y, degree):
    results = {}
    coeffs = numpy.polyfit(x, y, degree)  # polynomial fit; degree is the degree of the polynomial
    print(coeffs)
    results['polynomial'] = coeffs.tolist()

    # r-squared
    p = numpy.poly1d(coeffs)  # wrap the coefficients in a callable polynomial
    # fitted values and mean
    yhat = p(x)                          # or [p(z) for z in x]
    ybar = numpy.sum(y) / len(y)         # or sum(y)/len(y)
    ssreg = numpy.sum((yhat - ybar)**2)  # or sum([(yihat - ybar)**2 for yihat in yhat])
    sstot = numpy.sum((y - ybar)**2)     # or sum([(yi - ybar)**2 for yi in y])
    results['determination'] = ssreg / sstot  # coefficient of determination (R-squared)
    print(results)
    plt.plot(x, p(x), 'r', label='fit')
    plt.scatter(x, y)
    plt.show()

x = [1, 2, 3, 4, 5, 6]
y = [2.5, 3.51, 4.45, 5.52, 6.47, 7.2]
z1 = polyfit(x, y, 2)
[-0.02428571  1.12571429  1.37      ]
{'polynomial': [-0.024285714285714497, 1.1257142857142874, 1.3699999999999957], 'determination': 0.9989301416685602}

[Figure: sample points with the fitted quadratic curve]
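
The determination value printed above is the coefficient of determination. In the notation of the helper, with fitted values $\hat{y}_i = p(x_i)$ and mean $\bar{y} = \frac{1}{n}\sum_i y_i$, it computes

$$
R^2 = \frac{SS_{\mathrm{reg}}}{SS_{\mathrm{tot}}}
    = \frac{\sum_i (\hat{y}_i - \bar{y})^2}{\sum_i (y_i - \bar{y})^2},
$$

which, for a least-squares fit that includes a constant term, coincides with the more familiar $1 - SS_{\mathrm{res}}/SS_{\mathrm{tot}}$.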

Fitting custom functions with scipy.optimize


import numpy as np
from matplotlib import pyplot as plt
from scipy.optimize import curve_fit

def f(x):
    return 2*np.sin(x) + 3

def f_fit(x, a, b):
    return a*np.sin(x) + b

def f_show(x, p_fit):
    a, b = p_fit.tolist()
    return a*np.sin(x) + b

x = np.linspace(-2*np.pi, 2*np.pi)
y = f(x) + 0.5*np.random.randn(len(x))  # add noise to the true curve
p_fit, pcov = curve_fit(f_fit, x, y)    # curve fitting
print(p_fit)  # optimal parameters
print(pcov)   # estimated covariance matrix of the optimal parameters
y1 = f_show(x, p_fit)
plt.plot(x, f(x), 'r', label='original')
plt.scatter(x, y, c='g', label='before_fitting')  # scatter plot of the noisy samples
plt.plot(x, y1, 'b--', label='fitting')
plt.xlabel('x')
plt.ylabel('y')
plt.legend()
plt.show()
[2.0287684  2.99863621]
[[1.04777038e-02 7.43259080e-12]
 [7.43259080e-12 5.13407483e-03]]

[Figure: original curve, noisy samples, and the curve_fit result]
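
The diagonal of pcov holds the variance of each fitted parameter, so its square root gives the one-sigma standard errors; for the covariance matrix printed above this comes out to roughly [0.102, 0.072]. A short sketch continuing from the fit:

perr = np.sqrt(np.diag(pcov))  # one-sigma standard errors of a and b
print(perr)

# curve_fit defaults to an initial guess of all ones; for harder problems
# a starting point can be supplied explicitly:
p_fit2, pcov2 = curve_fit(f_fit, x, y, p0=[1.0, 1.0])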

import numpy as np
from scipy.optimize import leastsq  # leastsq from scipy's optimize module
import pylab as pl                  # plotting module pylab, aliased as pl

def func(x, p):
    """
    Model used for the fit: A*sin(2*pi*k*x + theta)
    """
    A, k, theta = p
    return A*np.sin(2*np.pi*k*x + theta)

def residuals(p, y, x):
    """
    Difference between the measured data x, y and the model; p holds the coefficients to be fitted
    """
    return y - func(x, p)

x = np.linspace(0, -2*np.pi, 100)
A, k, theta = 10, 0.34, np.pi/6        # parameters of the true function
y0 = func(x, [A, k, theta])            # true data
y1 = y0 + 2 * np.random.randn(len(x))  # experimental data: true data plus standard-normal noise

p0 = [7, 0.2, 0]  # first guess for the fit parameters

# Call leastsq to fit the data:
# residuals is the function that computes the error,
# p0 is the initial value of the parameters,
# args is the experimental data to fit.
plsq = leastsq(residuals, p0, args=(y1, x))


print("actual parameter:", [A, k, theta])  # true parameters
print("fitting parameter", plsq[0])        # parameters fitted from the experimental data

pl.plot(x, y0, label="actual data")                   # true data
pl.plot(x, y1, label="experimental data with noise")  # noisy experimental data
pl.plot(x, func(x, plsq[0]), label="fitting data")    # fitted curve
pl.legend()
pl.show()
actual parameter: [10, 0.34, 0.5235987755982988]
fitting parameter [10.09077955  0.3434116  -5.6891585 ]

[Figure: actual data, noisy experimental data, and the leastsq fit]
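
Note that the fitted phase −5.689 sits one full period away from a value near the true 0.524 (−5.689 + 2π ≈ 0.594): the optimizer landed on an equivalent branch of the periodic model. leastsq is SciPy's legacy MINPACK wrapper; the same fit can be written with the newer least_squares interface, a sketch reusing residuals, p0, y1 and x from above:

from scipy.optimize import least_squares

res = least_squares(residuals, p0, args=(y1, x))
print(res.x)     # fitted [A, k, theta]
print(res.cost)  # half the sum of squared residuals at the solution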

Multi-dimensional interpolation with scipy.interpolate

One-dimensional


import numpy as np
from scipy import interpolate
import pylab as pl

x = np.linspace(0, 10, 11)
# x = [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10.]
y = np.sin(x)
xnew = np.linspace(0, 10, 101)
pl.plot(x, y, "ro")

for kind in ["nearest", "zero", "slinear", "quadratic", "cubic"]:  # interpolation methods
    # "nearest" and "zero" are step interpolation
    # "slinear" is linear interpolation
    # "quadratic" and "cubic" are 2nd- and 3rd-order B-spline interpolation
    f = interpolate.interp1d(x, y, kind=kind)
    # 'slinear', 'quadratic' and 'cubic' refer to spline interpolation of first, second or third order
    ynew = f(xnew)
    pl.plot(xnew, ynew, label=str(kind))
pl.legend(loc="lower right")
pl.show()

[Figure: sin(x) samples with nearest, zero, slinear, quadratic and cubic interpolants]
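
By default interp1d raises a ValueError when queried outside [x.min(), x.max()]; passing bounds_error=False together with fill_value (or fill_value="extrapolate") changes that behavior. A small sketch with the same x and y as above:

f = interpolate.interp1d(x, y, kind="cubic", bounds_error=False, fill_value="extrapolate")
print(f(10.5))  # extrapolated beyond the sampled range; use with caution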

Two-dimensional

import numpy as np
from scipy import interpolate
import pylab as pl
import matplotlib as mpl

def func(x, y):
    return (x + y)*np.exp(-5.0*(x**2 + y**2))

# Divide the X-Y plane into a 15x15 grid
y, x = np.mgrid[-1:1:15j, -1:1:15j]

fvals = func(x, y)  # function value at each grid point: 15*15 values
print(len(fvals[0]))

# Two-dimensional cubic-spline interpolation
newfunc = interpolate.interp2d(x, y, fvals, kind='cubic')

# Evaluate the interpolant on a 100x100 grid
xnew = np.linspace(-1, 1, 100)  # x
ynew = np.linspace(-1, 1, 100)  # y
fnew = newfunc(xnew, ynew)      # interpolated values only: 100*100 values

# Plotting
# To make the difference before and after interpolation clearly visible,
# pass interpolation='nearest' to switch off imshow()'s built-in smoothing.
pl.subplot(121)
im1 = pl.imshow(fvals, extent=[-1, 1, -1, 1], cmap=mpl.cm.hot, interpolation='nearest', origin="lower")  # pl.cm.jet
# extent=[-1,1,-1,1] sets the x and y ranges
pl.colorbar(im1)

pl.subplot(122)
im2 = pl.imshow(fnew, extent=[-1, 1, -1, 1], cmap=mpl.cm.hot, interpolation='nearest', origin="lower")
pl.colorbar(im2)
pl.show()
15

[Figure: 15x15 grid values (left) and the 100x100 cubic-spline interpolation (right)]
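
interp2d has since been deprecated and removed in recent SciPy releases. On a regular grid, the same cubic-spline interpolation can be written with RectBivariateSpline; a sketch using the 1-D grid vectors that generated the mgrid above:

from scipy.interpolate import RectBivariateSpline

xg = np.linspace(-1, 1, 15)  # 1-D grid vectors behind np.mgrid[-1:1:15j, ...]
yg = np.linspace(-1, 1, 15)
# fvals is indexed [y, x], so the y vector comes first
spline = RectBivariateSpline(yg, xg, fvals, kx=3, ky=3)
fnew2 = spline(ynew, xnew)   # evaluated on the outer product of the new grids
print(fnew2.shape)           # (100, 100)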

from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt
from scipy import interpolate


fig = plt.figure()
ax = fig.add_subplot(projection='3d')  # fig.gca(projection='3d') was removed in newer matplotlib
# Prepare arrays x, y, z
x = np.arange(-5.01, 5.01, 0.25)
y = np.arange(-5.01, 5.01, 0.25)
xx, yy = np.meshgrid(x, y)
z = np.sin(xx**2 + yy**2)
f = interpolate.interp2d(x, y, z, kind='cubic')


xnew = np.arange(-5.01, 5.01, 1e-2)
ynew = np.arange(-5.01, 5.01, 1e-2)
znew = f(xnew, ynew)
ax.plot(x, y, z[0, :], 'ro-')  # one row of the coarse samples, drawn as a 3-D line
plt.show()

[Figure: 3-D line plot of one row of the sampled surface]
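
The plot above shows only a single row of the coarse grid as a 3-D line. To visualize the whole interpolated surface, one option (a sketch reusing xnew, ynew and znew from above) is plot_surface on a meshgrid of the fine points:

fig = plt.figure()
ax = fig.add_subplot(projection='3d')
xxn, yyn = np.meshgrid(xnew, ynew)
ax.plot_surface(xxn, yyn, znew, cmap='hot')  # interpolated surface
plt.show()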

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import splev, splrep

x = np.linspace(0, 10, 10)
y = np.sin(x)
spl = splrep(x, y)   # B-spline representation: (knots, coefficients, degree)
print(spl)
x2 = np.linspace(0, 10, 200)
y2 = splev(x2, spl)  # evaluate the spline on the finer grid
plt.plot(x, y, 'o', x2, y2)
plt.show()
(array([ 0.        ,  0.        ,  0.        ,  0.        ,  2.22222222,
        3.33333333,  4.44444444,  5.55555556,  6.66666667,  7.77777778,
       10.        , 10.        , 10.        , 10.        ]), array([-4.94881722e-18,  8.96543619e-01,  1.39407154e+00, -2.36640266e-01,
       -1.18324030e+00, -8.16301228e-01,  4.57836125e-01,  1.48720677e+00,
        1.64338775e-01, -5.44021111e-01,  0.00000000e+00,  0.00000000e+00,
        0.00000000e+00,  0.00000000e+00]), 3)

[Figure: sin(x) samples with the B-spline interpolant]
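
The tuple printed by splrep is the B-spline representation (knot vector t, coefficients c, degree k). With no weights given, splrep interpolates exactly (smoothing factor s = 0); a positive s trades exactness for smoothness, which helps on noisy data. A sketch with the same x and y:

spl_s = splrep(x, y, s=0.5)  # allow the spline to deviate from the data
y2_s = splev(x2, spl_s)
plt.plot(x, y, 'o', x2, y2_s)
plt.show()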

Fitting with sklearn

sklearn's unified API supports both single-variable and multivariate fitting. Below is a single-variable polynomial-fitting example; for multivariate fitting see reference [5].

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures

plt.figure()                  # create the figure
plt.title('single variable')  # figure title
plt.xlabel('x')               # x-axis label
plt.ylabel('y')               # y-axis label
plt.axis([30, 400, 100, 400])
plt.grid(True)                # draw grid lines

X = [[50], [100], [150], [200], [250], [300]]
y = [[150], [200], [250], [280], [310], [330]]
X_test = [[250], [300]]       # held out for the final evaluation
y_test = [[310], [330]]       # held out for the final evaluation
plt.plot(X, y, 'o')

model = LinearRegression()
model.fit(X, y)
X2 = [[30], [400]]
y2 = model.predict(X2)
plt.plot(X2, y2, 'g-')

xx = np.linspace(30, 400, 100)  # a series of x values for plotting
quadratic_featurizer = PolynomialFeatures(degree=2)        # a degree-2 polynomial feature transformer
X_train_quadratic = quadratic_featurizer.fit_transform(X)  # transform the sample X values into quadratic features
xx_quadratic = quadratic_featurizer.transform(xx.reshape(xx.shape[0], 1))  # apply the fitted transformer to the plotting points
regressor_quadratic = LinearRegression()                   # a linear-regression instance
regressor_quadratic.fit(X_train_quadratic, y)              # train on the polynomial-transformed x values
plt.plot(xx, regressor_quadratic.predict(xx_quadratic), 'r')  # plot the trained model
print("intercept:", regressor_quadratic.intercept_)
print("quadratic coefficients:", regressor_quadratic.coef_)
# Model evaluation:
print('linear regression r-squared', model.score(X_test, y_test))
X_test_quadratic = quadratic_featurizer.transform(X_test)
print('quadratic regression r-squared', regressor_quadratic.score(X_test_quadratic, y_test))

# Show the figure
plt.show()
intercept: [89.]
quadratic coefficients: [[ 0.          1.295      -0.00164286]]
linear regression r-squared 0.07555555555555149
quadratic regression r-squared 0.9993367346938364

[Figure: sample points, linear fit (green) and quadratic fit (red)]
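
The featurizer-plus-regressor pair above can be bundled into a single estimator with sklearn's Pipeline, which keeps the polynomial transform and the regression in sync; a minimal sketch with the same X and y:

from sklearn.pipeline import make_pipeline

quad_model = make_pipeline(PolynomialFeatures(degree=2), LinearRegression())
quad_model.fit(X, y)                # fit the transform and the regression in one call
print(quad_model.predict([[120]]))  # predict directly from the raw x value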

Multivariate regression

Both statsmodels and sklearn can do multivariate regression, but statsmodels' output is considerably more polished and professional-looking, so statsmodels is the recommended choice for fitting multivariate regression parameters.

import numpy as np
import statsmodels.api as sm

# Generate the data
nobs = 100
X = np.random.random((nobs, 3))
# sm expects an explicit intercept column, so this step is required
X = sm.add_constant(X)
beta = [1, .1, .5, .5]
e = np.random.random(nobs)
y = np.dot(X, beta) + e
# Call the OLS fitting routine; for the other algorithms see reference [2],
# statsmodels provides four efficient fitting algorithms.
results = sm.OLS(y, X).fit()
# statsmodels' output is very polished: besides the fitted parameters it also
# reports evaluation metrics such as the F-statistic, AIC and R-squared.
print(results.summary())
                            OLS Regression Results                            
==============================================================================
Dep. Variable:                      y   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.341
Method:                 Least Squares   F-statistic:                     18.09
Date:                Fri, 28 Dec 2018   Prob (F-statistic):           2.20e-09
Time:                        11:38:53   Log-Likelihood:                -13.182
No. Observations:                 100   AIC:                             34.36
Df Residuals:                      96   BIC:                             44.78
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          1.4756      0.086     17.181      0.000       1.305       1.646
x1             0.1855      0.095      1.948      0.054      -0.004       0.374
x2             0.3950      0.101      3.900      0.000       0.194       0.596
x3             0.5127      0.100      5.116      0.000       0.314       0.712
==============================================================================
Omnibus:                       21.977   Durbin-Watson:                   1.866
Prob(Omnibus):                  0.000   Jarque-Bera (JB):                5.354
Skew:                           0.133   Prob(JB):                       0.0688
Kurtosis:                       1.898   Cond. No.                         5.97
==============================================================================

Warnings:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
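
Beyond the summary table, the fitted results object exposes the same numbers programmatically; a short sketch continuing from results above:

print(results.params)          # fitted coefficients (const, x1, x2, x3)
print(results.rsquared)        # R-squared, 0.361 in the run shown here
print(results.predict(X[:5]))  # fitted values for the first five observations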

References

[1] https://docs.scipy.org/doc/scipy/reference/interpolate.html
[2] http://www.statsmodels.org/stable/regression.html
[3] https://zhuanlan.zhihu.com/p/28149195
[4] https://docs.scipy.org/doc/numpy-1.15.0/reference/generated/numpy.polyfit.html
[5] https://ask.hellobi.com/blog/okajun/12252
[6] https://cloud.tencent.com/developer/article/1064376
