From the previous chapters we obtained the simplest recursive formula for reinforcement learning in psychology, $V_t = V_{t-1} + a \cdot (r_t - V_{t-1})$, known as the naive reinforcement learning model. However, this simple rule ($PE_i = r_i - V_{i-1}$, $V_i = V_{i-1} + a \cdot PE_i$) cannot explain the three conditioning phenomena described above.
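A minimal check of how this recursion behaves (a sketch assuming a constant reward $r = 1$, an initial value $V_0 = 0$, and the learning rate $a = 0.05$ used in the simulations below): unrolling the update gives the closed form $V_t = 1 - (1 - a)^t$, so the value rises exponentially toward the reward.

```python
# Sketch: the naive update V_t = V_{t-1} + a*(r - V_{t-1}) unrolls to
# V_t = 1 - (1 - a)**t when r = 1 and V_0 = 0 (assumed values).
a = 0.05   # learning rate
V = 0.0    # initial value V_0
for t in range(1, 101):
    V = V + a * (1.0 - V)                       # recursive update with r = 1
    assert abs(V - (1 - (1 - a)**t)) < 1e-9     # matches the closed form
print(V)   # ~0.994 after 100 trials: the value has nearly converged to the reward
```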
The following Python code simulates the formation of the overshadowing effect under the naive reinforcement learning model.
```python
# Naïve model
from scipy.stats import bernoulli
import numpy as np
import matplotlib.pyplot as plt

def overshadow(T):
    ''' T: number of trials '''
    r = np.ones(T)   # (T,) array of ones: reward on every trial
    A = np.ones(T)   # (T,) array of ones: stimulus A present on every trial
    B = np.ones(T)   # (T,) array of ones: stimulus B present on every trial
    return r, A, B

a = 0.05   # learning rate
T = 100    # number of trials
r, A, B = overshadow(T)

Va = np.empty(T)
Vb = np.empty(T)
for i in range(T):   # loop over trials
    if i == 0:
        Va[0] = 0 + a*(r[0] - 0)
        Vb[0] = 0 + a*(r[0] - 0)
    else:
        Va[i] = Va[i-1] + a*(r[i] - Va[i-1])
        Vb[i] = Vb[i-1] + a*(r[i] - Vb[i-1])

plt.plot(r, '-k', label='reward')
plt.plot(Va, linewidth=10, label='Va')
plt.plot(Vb, linewidth=2, label='Vb')
plt.xlabel('Trials')
plt.ylabel('Value')
plt.legend()
```
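Under this naive rule each stimulus is updated toward the reward independently of the other, so Va and Vb overlap in the plot and both climb to 1. Neither cue loses associative strength to the other, so the model produces no overshadowing. The Rescorla-Wagner model, simulated next, repairs this by computing a single prediction error from the summed value of all stimuli presented on a trial.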
```python
# RW model - Overshadowing
a = 0.05   # learning rate
T = 100    # how many trials
r, A, B = overshadow(T)

Va = np.empty(T)
Vb = np.empty(T)
for i in range(T):   # loop trials
    if i == 0:
        Va[0] = 0 + a*(r[0] - 0*A[i] - 0*B[i])
        Vb[0] = 0 + a*(r[0] - 0*A[i] - 0*B[i])
    else:
        PE = r[i] - (Va[i-1]*A[i] + Vb[i-1]*B[i])
        Va[i] = Va[i-1] + a*PE * A[i]
        Vb[i] = Vb[i-1] + a*PE * B[i]

plt.plot(r, '-k', label='reward')
plt.plot(Va, linewidth=10, label='Va')
plt.plot(Vb, linewidth=2, label='Vb')
plt.xlabel('Trials')
plt.ylabel('Value')
plt.legend()
```
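With both cues present on every trial, Va and Vb share one prediction error, PE = r - (Va + Vb). Learning stops once Va + Vb ≈ 1, and by symmetry each value levels off near 0.5 rather than 1: the two cues split the available associative strength, which is the overshadowing effect. The next simulation applies the same update to a blocking design, in which stimulus A alone is paired with reward during phase 1 before A and B are presented together during phase 2.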
```python
def block(T):
    T1 = round(T/2)   # round to the nearest integer: half the trials go to each phase
    # phase 1: A -> R
    r1 = np.ones(T1)
    A1 = np.ones(T1)
    B1 = np.zeros(T1)
    # phase 2: A, B -> R
    r2 = np.ones(T-T1)
    A2 = np.ones(T-T1)
    B2 = np.ones(T-T1)
    # combine two phases
    r = np.hstack((r1, r2))
    A = np.hstack((A1, A2))
    B = np.hstack((B1, B2))
    return r, A, B

a = 0.05    # learning rate
T = 1000    # how many trials
r, A, B = block(T)

Va = np.empty(T)
Vb = np.empty(T)
for i in range(T):   # loop trials
    if i == 0:
        Va[0] = 0 + a*(r[0] - 0*A[i] - 0*B[i])
        Vb[0] = 0 + a*(r[0] - 0*A[i] - 0*B[i])
    else:
        PE = r[i] - Va[i-1]*A[i] - Vb[i-1]*B[i]
        Va[i] = Va[i-1] + a*PE * A[i]
        Vb[i] = Vb[i-1] + a*PE * B[i]

plt.plot(r, '-k', label='reward')
plt.plot(Va, linewidth=10, label='Va')
plt.plot(Vb, linewidth=2, label='Vb')
plt.xlabel('Trials')
plt.ylabel('Value')
plt.legend()
```
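By the end of phase 1, Va is already close to 1, so when B is introduced in phase 2 the shared prediction error is nearly zero (PE = 1 - Va - Vb ≈ 0) and Vb barely grows: prior learning about A blocks learning about B. The last simulation uses the same rule for conditioned inhibition, randomly interleaving A -> reward trials with A,B -> no-reward trials.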
```python
def inhibition(T):
    r = np.empty(T)
    A = np.empty(T)
    B = np.empty(T)
    for i in range(T):
        if bernoulli.rvs(0.5) == 1:
            # A -> R trial
            r[i] = 1
            A[i] = 1
            B[i] = 0
        else:
            # A, B -> 0 trial
            r[i] = 0
            A[i] = 1
            B[i] = 1
    return r, A, B

a = 0.05    # learning rate
T = 1000    # how many trials
r, A, B = inhibition(T)

Va = np.empty(T)
Vb = np.empty(T)
for i in range(T):   # loop trials
    if i == 0:
        Va[0] = 0 + a*(r[0] - 0*A[i] - 0*B[i])
        Vb[0] = 0 + a*(r[0] - 0*A[i] - 0*B[i])
    else:
        PE = r[i] - Va[i-1]*A[i] - Vb[i-1]*B[i]
        Va[i] = Va[i-1] + a*PE * A[i]
        Vb[i] = Vb[i-1] + a*PE * B[i]

#plt.plot(r, '-k', label='reward')
plt.plot(Va, linewidth=10, label='Va')
plt.plot(Vb, linewidth=2, label='Vb')
plt.xlabel('Trials')
plt.ylabel('Value')
plt.legend()
```
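Because B appears only on unrewarded trials, the shared prediction error pushes Vb below zero while Va stays positive (with these settings the values head toward roughly Va ≈ 1 and Vb ≈ -1): B acquires inhibitory value, reproducing conditioned inhibition. All three simulations use the same update rule; a minimal sketch for an arbitrary number of stimuli is shown below (the helper name rw_trial and its vectorized form are illustrative, not from the original text).

```python
import numpy as np

def rw_trial(V, x, r, a=0.05):
    """One Rescorla-Wagner update (illustrative sketch).
    V: (N,) array of current stimulus values
    x: (N,) array of 0/1 indicators of which stimuli are present
    r: reward delivered on this trial
    a: learning rate"""
    PE = r - np.dot(V, x)    # prediction error: reward minus summed value of present stimuli
    return V + a * PE * x    # only presented stimuli are updated

# e.g. one A,B -> reward trial, starting from V = [0, 0]
V = rw_trial(np.zeros(2), np.array([1.0, 1.0]), r=1.0)
```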
To address this, Robert A. Rescorla and Allan R. Wagner proposed the Rescorla-Wagner (RW) model in 1972. According to the RW model, when multiple stimuli are present, the prediction error is the reward minus the summed value of all presented stimuli, $PE_i = r_i - \sum_{k=1}^{N} V_k$, where $N$ stimuli are presented on trial $i$. When values are updated, only the stimuli that were actually presented have their values updated. The value-update process when two stimuli are presented simultaneously is therefore: