import numpy as np
import matplotlib.pyplot as plt
# Ground-truth parameters of the line y = a * x + b used to simulate data.
a = 0.5  # slope (coefficient of x)
b = 3  # intercept
# Scale applied to the standard-normal draws below, i.e. the standard
# deviation of the noise (despite the "Var" in the name).
noiseVar = 5
# Predictor values: the integers 1 through 99.
x = np.arange(1, 100)
# Simulated response: the true line plus one independent Gaussian noise
# draw per element of x.
y = (a * x + b) + noiseVar * np.random.randn(*x.shape)
# Draw a scatter plot of the simulated data: one red circle marker per
# (x, y) point, with no connecting line.
plt.plot(x, y, 'o', color='r')
# Axis labels for the plot.
plt.xlabel('Population')
plt.ylabel('House price')
可以得到以下分布图:
我们用bootstrap的方法来估计参数的可靠性(reliability)
# Bootstrap the least-squares line fit: every round refits the line on a
# resample of the (x, y) pairs drawn with replacement, and the spread of the
# refitted parameters is what the reliability estimate is based on.
nBoot = 1000  # number of bootstrap resamples
# Row i receives the (slope, intercept) pair fitted on the i-th resample.
res = np.empty((nBoot, 2))
for i in range(nBoot):
    # Draw y.size indices from range(y.size) with replacement. Passing the
    # integer population size avoids building np.arange(y.size) every round.
    idx = np.random.choice(y.size, y.size, replace=True)
    # Design matrix as rows: the resampled x and a row of ones for the
    # intercept. x must be indexed with the same idx as y so that each
    # resampled point keeps its original (x, y) pairing.
    x2 = np.vstack((x[idx], np.ones(x.size)))
    # Least-squares fit; element [0] of lstsq's result is the solution
    # vector, ordered like the rows of x2: (slope, intercept).
    res[i, :] = np.linalg.lstsq(x2.T, y[idx], rcond=None)[0]
import numpy as np
import matplotlib.pyplot as plt
# Ground-truth parameters of the line y = a * x + b used to simulate data.
a = 0.5  # slope (coefficient of x)
b = 3  # intercept
# Scale applied to the standard-normal draws below, i.e. the standard
# deviation of the noise (despite the "Var" in the name).
noiseVar = 5
# Predictor values: from 1 up to (but excluding) 1000 in steps of 0.1.
x = np.arange(1, 1000, 0.1)
# Simulated response: the true line plus one independent Gaussian noise
# draw per element of x.
y = (a * x + b) + noiseVar * np.random.randn(*x.shape)
# Draw a scatter plot of the simulated data: one red circle marker per
# (x, y) point, with no connecting line.
plt.plot(x, y, 'o', color='r')
得到的散点图为:
现在把数据量增加到原来的约100倍(x = np.arange(1, 1000, 0.1)),再用bootstrap来估计参数的可靠性(reliability)
# Repeat the bootstrap on the current (x, y): every round refits the line on
# a resample of the (x, y) pairs drawn with replacement, so that the spread
# of the refitted parameters can be examined.
nBoot = 1000  # number of bootstrap resamples
# Row i receives the (slope, intercept) pair fitted on the i-th resample.
res = np.empty((nBoot, 2))
for i in range(nBoot):
    # Draw y.size indices from range(y.size) with replacement. Passing the
    # integer population size avoids building np.arange(y.size) every round.
    idx = np.random.choice(y.size, y.size, replace=True)
    # Design matrix as rows: the resampled x and a row of ones for the
    # intercept. x must be indexed with the same idx as y so that each
    # resampled point keeps its original (x, y) pairing.
    x2 = np.vstack((x[idx], np.ones(x.size)))
    # Least-squares fit; element [0] of lstsq's result is the solution
    # vector, ordered like the rows of x2: (slope, intercept).
    res[i, :] = np.linalg.lstsq(x2.T, y[idx], rcond=None)[0]
import numpy as np
import matplotlib.pyplot as plt
# Ground-truth parameters of the line y = a * x + b used to simulate data.
a = 0.05  # slope (coefficient of x)
b = 3  # intercept
# Scale applied to the standard-normal draws below, i.e. the standard
# deviation of the noise (despite the "Var" in the name).
noiseVar = 5
# Predictor values: from 1 up to (but excluding) 100 in steps of 0.1.
x = np.arange(1, 100, 0.1)
# Simulated response: the true line plus one independent Gaussian noise
# draw per element of x.
y = (a * x + b) + noiseVar * np.random.randn(*x.shape)
然后我们来fit一条直线
# Fit a straight line to the full data set by least squares.
# Design matrix as rows: x itself (used in its original order, without any
# resampling index) and a row of ones for the intercept.
x2 = np.vstack([x, np.ones(x.size)])
# Element [0] of lstsq's result is the solution vector, ordered like the
# rows of x2: (slope, intercept).
res = np.linalg.lstsq(np.transpose(x2), y, rcond=None)[0]
# From here on a and b hold the fitted slope and intercept.
a, b = res
print(res)
得到(a,b)的结果是:
[0.03829224 3.6987213 ]
我们用permutation的方法来得到线性系数的显著性p-value
# Permutation step: refit the line many times with y shuffled against a fixed
# x, which breaks the original (x, y) pairing. res2 collects the refitted
# coefficients; the p-value computation itself is not part of this block.
nBoot = 10000  # number of permutations
# Row i receives the (slope, intercept) pair fitted on the i-th shuffle.
res2 = np.empty((nBoot, 2))
# x stays in its original order for every permutation, so the design matrix
# (rows: x and a row of ones for the intercept) never changes. Build it once
# instead of once per iteration.
x2 = np.vstack((x, np.ones(x.size)))
for i in range(nBoot):
    # Random reordering of the indices 0..y.size-1, i.e. sampling every
    # index exactly once (without replacement).
    idx = np.random.permutation(y.size)
    # Least-squares fit of the shuffled y on the unshuffled x; element [0]
    # of lstsq's result is the solution vector (slope, intercept).
    res2[i, :] = np.linalg.lstsq(x2.T, y[idx], rcond=None)[0]