From: Al Haji-Ali
Subject: Patch to avoid superfluous output directories
Date: Fri, 23 Jul 2021 18:13:13 +0100
User-agent: mu4e 1.5.13; emacs 27.2
Hello all,

I noticed a problem with the output-dir feature that I would like to patch. In my setup I set `TeX-output-dir` to `build` by default using `setq-default`, and set the TeX-master as a file-local variable. As a result, when `latex-mode` is applied (after a file is opened), `TeX-region-file` is called before the local variables are hacked, and during this call the wrong output directory is created (since that is where the TeX-region file will be placed). If the output directory is a sub-directory relative to the master, this directory is useless and remains empty.

The attached patch fixes this by making `TeX--master-output-dir` accept an optional argument `ensure` which controls the creation of the directory. The directory is created only when `TeX--master-output-dir` is called from `TeX--output-dir-arg`, not when it is called from `TeX-region-file`.

Best regards,
-- Al
In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
from math import pi
from numpy.linalg import inv
np.random.seed(1)
In [2]:
def prior(x1, x2, mu, ambda, sigma_k, samples):
    # Define the kernel function k(x1, x2) with hyper-parameters:
    # length scale `ambda` and output scale `sigma_k`.
    matrix_norm = np.sum(x1**2, 1).reshape(-1, 1) + np.sum(x2**2, 1) - 2 * np.dot(x1, x2.T)
    k = sigma_k**2 * np.exp(-1 / ambda**2 * matrix_norm)
    # Draw `samples` random paths from a multivariate normal distribution
    # with mean `mu` and covariance `k`.
    f_prior = np.random.multivariate_normal(mu.ravel(), k, samples)
    # Pointwise standard deviation and 95% confidence interval.
    sd = np.sqrt(np.diag(k))
    ci_low = mu.ravel() - 1.96 * sd
    ci_high = mu.ravel() + 1.96 * sd
    # Plot the sample paths.
    plt.figure(figsize=(10, 4), dpi=80)
    ax1 = plt.subplot(121)
    plt.plot(x1, f_prior.T)
    plt.grid()
    plt.xlabel("input, x")
    plt.ylabel("output, f(x)")
    plt.title("Sample paths from the Gaussian prior")
    # Plot the mean and the 95% band.
    ax2 = plt.subplot(122)
    plt.grid()
    ax2.plot(x1, mu, color='black', label='Mean')
    plt.fill_between(x1.ravel(), ci_high, ci_low, facecolor='grey', alpha=0.5)
    plt.legend()
    return f_prior
In [3]:
# Generate training inputs with sample size N = 10.
N = 10
X = np.linspace(start=-pi, stop=pi, num=N).reshape(-1, 1)
# Generate test inputs.
X_s = np.linspace(start=-pi, stop=pi, num=int(0.8 * N)).reshape(-1, 1)
# Assume a zero mean function.
mu = np.zeros(X_s.shape)
# Draw 6 sample paths with hyper-parameters ambda = 1 and sigma_k = 1.
f_prior = prior(X_s, X_s, mu, 1.0, 1.0, 6)
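For denser input grids the kernel matrix built in `prior` can be numerically singular, and `np.random.multivariate_normal` may warn about a non-positive-semidefinite covariance. A common remedy, shown here as a sketch (not part of the original notebook; the jitter value `1e-8` is an arbitrary choice), is to add a small jitter to the diagonal and sample through a Cholesky factor:

In [ ]:
def sample_prior_stable(x, mu, ambda, sigma_k, samples, jitter=1e-8):
    # Kernel matrix with a small diagonal jitter for numerical stability.
    d2 = np.sum(x**2, 1).reshape(-1, 1) + np.sum(x**2, 1) - 2 * np.dot(x, x.T)
    k = sigma_k**2 * np.exp(-d2 / ambda**2) + jitter * np.eye(len(x))
    # f = mu + L z with L L^T = k and z ~ N(0, I).
    L = np.linalg.cholesky(k)
    z = np.random.randn(len(x), samples)
    return (mu.reshape(-1, 1) + L.dot(z)).T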
In [4]:
# Define the kernel function k(x1, x2).
def kernel(x1, x2, ambda, sigma_k):
    matrix_norm = np.sum(x1**2, 1).reshape(-1, 1) + np.sum(x2**2, 1) - 2 * np.dot(x1, x2.T)
    k = sigma_k**2 * np.exp(-1 / ambda**2 * matrix_norm)
    return k
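A quick sanity check (an addition for illustration, not a cell from the original notebook): the matrix returned by `kernel` should be symmetric with `sigma_k**2` on the diagonal, since the squared distance of a point to itself is zero.

In [ ]:
K = kernel(X, X, 1.0, 0.5)
print(np.allclose(K, K.T))            # symmetric: True
print(np.allclose(np.diag(K), 0.25))  # diagonal equals sigma_k**2 = 0.25: True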
In [5]:
# Test inputs X_s, training inputs X, noise level noise_sigma.
def posterior(X_s, X, ambda, sigma_k, noise_sigma, samples):
    k_train = kernel(X, X, ambda, sigma_k)      # K(X, X)
    k_prior = kernel(X_s, X_s, ambda, sigma_k)  # K(X_star, X_star)
    k_post = kernel(X_s, X, ambda, sigma_k)     # K(X_star, X)
    k_pt = kernel(X, X_s, ambda, sigma_k)       # K(X, X_star) = k_post.T
    # Training outputs Y = f(X) + epsilon with f(X) = sin(X) and
    # epsilon ~ N(0, noise_sigma^2).
    epsilon = np.random.normal(loc=0, scale=noise_sigma, size=X.shape)
    Y = np.sin(X) + epsilon
    # Posterior mean and covariance via equation (18) in the dissertation.
    k_noise_inv = inv(k_train + noise_sigma**2 * np.eye(len(X)))
    mu_s = np.dot(np.dot(k_post, k_noise_inv), Y)
    k_s = k_prior - np.dot(np.dot(k_post, k_noise_inv), k_pt)
    f_posterior = np.random.multivariate_normal(mu_s[:, 0], k_s, samples)
    # Pointwise standard deviation and 95% confidence interval.
    SD = np.sqrt(np.diag(k_s))
    CI_low = mu_s.ravel() - 1.96 * SD
    CI_high = mu_s.ravel() + 1.96 * SD
    # Plot the sample paths; `i` is the loop index set by the caller below.
    plt.figure(figsize=(10, 4), dpi=80)
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(top=2)
    plt.plot(X_s, f_posterior.T, alpha=0.5)
    plt.errorbar(X, Y, yerr=noise_sigma, color='black', fmt='o')
    plt.grid()
    plt.xlabel(r"input x, $k=\sigma_k^2 \exp(-\Vert X-X'\Vert_2^2 \, / \, \lambda^2)$")
    plt.ylabel("output, f(x) = sin x")
    plt.title(rf'$\lambda={ambda}$, $\sigma_k={sigma_k}$')
    # Plot the mean and the 95% band.
    plt.figure(figsize=(10, 4), dpi=80)
    plt.subplot(4, 2, i + 1)
    plt.subplots_adjust(top=2)
    plt.grid()
    plt.errorbar(X, Y, yerr=noise_sigma, color='black', fmt='o')
    plt.plot(X_s, mu_s.flat, color='black', label='Mean')
    plt.fill(np.concatenate([X_s, X_s[::-1]]),
             np.concatenate([CI_low, CI_high[::-1]]), alpha=0.5, fc='grey')
    plt.legend()
    return f_posterior
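Explicitly inverting the kernel matrix, as in `posterior` above, is numerically fragile and costs O(n^3) with a poor constant. A standard alternative (a sketch under the same model, not code from the original notebook; note it takes the training outputs `Y` as an argument instead of generating them internally) solves the linear systems through a Cholesky factorization:

In [ ]:
from scipy.linalg import cho_factor, cho_solve

def posterior_chol(X_s, X, Y, ambda, sigma_k, noise_sigma):
    # Same posterior mean and covariance as `posterior`, computed with a
    # Cholesky solve instead of an explicit matrix inverse.
    K = kernel(X, X, ambda, sigma_k) + noise_sigma**2 * np.eye(len(X))
    K_s = kernel(X_s, X, ambda, sigma_k)
    c, low = cho_factor(K)
    mu_s = K_s.dot(cho_solve((c, low), Y))
    k_s = kernel(X_s, X_s, ambda, sigma_k) - K_s.dot(cho_solve((c, low), K_s.T))
    return mu_s, k_s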
In [6]:
hype_params = [
    (0.1, 0.2),
    (0.4, 0.2),
    (0.7, 0.4),
    (1.0, 0.4),
]
for i, (ambda, sigma_k) in enumerate(hype_params):
    f_posterior = posterior(X_s, X, ambda=ambda, sigma_k=sigma_k, noise_sigma=1e-4, samples=6)
In [7]:
# Estimate the hyper-parameters of the non-standard RBF kernel via
# maximum likelihood estimation (MLE).
from scipy.optimize import minimize

class MLE:
    def __init__(self, optimization=True):
        self.is_fit = False
        self.X, self.Y = None, None
        self.hyparams = {"ambda": 0.5, "sigma_k": 0.2, "noise_sigma": 1e-4}
        self.optimization = optimization

    def fitting(self, X, Y):
        # Store the training data.
        self.X = np.array(X)
        self.Y = np.array(Y)

        # Negative log marginal likelihood as a function of (ambda, sigma_k).
        def Negative_Log_Likelihood(hyparams):
            self.hyparams["ambda"], self.hyparams["sigma_k"] = hyparams[0], hyparams[1]
            Kyy = kernel(self.X, self.X, self.hyparams["ambda"], self.hyparams["sigma_k"]) \
                + (self.hyparams["noise_sigma"]**2) * np.eye(len(self.X))
            NLL = 0.5 * self.Y.T.dot(np.linalg.inv(Kyy)).dot(self.Y) \
                + 0.5 * np.linalg.slogdet(Kyy)[1] \
                + 0.5 * len(self.X) * np.log(2 * np.pi)
            return NLL.ravel()[0]

        # Optimize the hyper-parameters.
        if self.optimization:
            res = minimize(Negative_Log_Likelihood,
                           [self.hyparams["ambda"], self.hyparams["sigma_k"]],
                           bounds=((1e-4, 1e4), (1e-4, 1e4)),
                           method='L-BFGS-B')
            self.hyparams["ambda"], self.hyparams["sigma_k"] = res.x[0], res.x[1]
        self.is_fit = True

    def predict(self, X_s):
        if not self.is_fit:
            print("GPR model not fit yet.")
            return
        X_s = np.asarray(X_s)
        k_train = kernel(self.X, self.X, self.hyparams["ambda"], self.hyparams["sigma_k"])  # K(X, X)
        k_prior = kernel(X_s, X_s, self.hyparams["ambda"], self.hyparams["sigma_k"])        # K(X_star, X_star)
        k_post = kernel(X_s, self.X, self.hyparams["ambda"], self.hyparams["sigma_k"])      # K(X_star, X)
        k_pt = kernel(self.X, X_s, self.hyparams["ambda"], self.hyparams["sigma_k"])        # K(X, X_star)
        k_train_inv = np.linalg.inv(k_train + (self.hyparams["noise_sigma"]**2) * np.eye(len(self.X)))
        mu_s = np.dot(np.dot(k_post, k_train_inv), self.Y)
        k_s = k_prior - np.dot(np.dot(k_post, k_train_inv), k_pt)
        # Pointwise standard deviation and 95% confidence interval.
        SD = np.sqrt(np.diag(k_s))
        CI_low = mu_s.ravel() - 1.96 * SD
        CI_high = mu_s.ravel() + 1.96 * SD
        return mu_s, k_s, CI_low, CI_high

def y(x, noise_sigma):
    # Noisy observations y = sin(x) + epsilon, epsilon ~ N(0, noise_sigma^2).
    x = np.asarray(x)
    epsilon = np.random.normal(loc=0, scale=noise_sigma, size=x.shape)
    return (np.sin(x) + epsilon).tolist()

Y = y(X, noise_sigma=1e-4)
gpr = MLE(optimization=True)
gpr.fitting(X, Y)
mu_s, k_s, CI_low, CI_high = gpr.predict(X_s)
Y_s = mu_s.ravel()
plt.figure()
plt.grid()
plt.title(r"$\lambda$=%.1f, $\sigma_k$=%.1f" % (gpr.hyparams["ambda"], gpr.hyparams["sigma_k"]))
plt.fill_between(X_s.ravel(), CI_high, CI_low, alpha=0.1, label='95% confidence interval')
plt.plot(X_s, Y_s, label="predict")
plt.scatter(X, Y, label="train points", c="black", marker="o")
plt.xlabel(r"test input X_s, $k=\sigma_k^2 \exp(-\Vert X-X'\Vert_2^2 \, / \, \lambda^2)$")
plt.ylabel(r"test output, Y_s = sin X_s + $\epsilon$")
plt.legend()
Out[7]:
<matplotlib.legend.Legend at 0x7fee8b94c5e0>
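The closure `Negative_Log_Likelihood` is only visible inside `fitting`, but it can be useful to see the surface the optimizer works on. The sketch below is an illustrative addition, not a cell from the original notebook; `nll` re-implements the same formula at module level and evaluates it on a few length scales:

In [ ]:
def nll(ambda, sigma_k, X, Y, noise_sigma=1e-4):
    # Negative log marginal likelihood; same formula as in MLE.fitting.
    Y = np.array(Y)
    Kyy = kernel(X, X, ambda, sigma_k) + noise_sigma**2 * np.eye(len(X))
    return (0.5 * Y.T.dot(np.linalg.inv(Kyy)).dot(Y)
            + 0.5 * np.linalg.slogdet(Kyy)[1]
            + 0.5 * len(X) * np.log(2 * np.pi)).item()

for a in (0.2, 0.5, 1.0, 2.0):
    print(a, nll(a, gpr.hyparams["sigma_k"], X, Y))
In [8]: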
# Method 2: hyper-parameter optimization via the scikit-learn library
# (MLE fitting).
from sklearn.gaussian_process import GaussianProcessRegressor as GPR
from sklearn.gaussian_process.kernels import RBF, ConstantKernel as C

def y(X, noise_sigma):
    X = np.asarray(X)
    epsilon = np.random.normal(loc=0, scale=noise_sigma, size=X.shape)
    return (np.sin(X) + epsilon).tolist()

Y = y(X, noise_sigma=1e-4)
# Instantiate a Gaussian process model to estimate the hyper-parameters.
# scikit-learn's kernel is ConstantKernel * RBF(length_scale), i.e.
# c * exp(-||X - X'||^2 / (2 * length_scale^2)), whereas ours is
# sigma_k^2 * exp(-||X - X'||^2 / lambda^2). Matching the two forms gives
# c = sigma_k^2 and ambda = sqrt(2) * length_scale; the initial values and
# bounds below are chosen to be roughly comparable to Method 1. (The
# variable is named gp_kernel so it does not shadow the kernel() function
# defined earlier.)
gp_kernel = C(constant_value=0.25, constant_value_bounds=(1e-4, 1e4)) * RBF(length_scale=0.3, length_scale_bounds=(1e-4, 1e4))
gpr = GPR(kernel=gp_kernel, n_restarts_optimizer=2)
# Fit the data via MLE.
gpr.fit(X, Y)
mu_s, k_s = gpr.predict(X_s, return_cov=True)
Y_s = mu_s.ravel()
# Pointwise standard deviation and 95% confidence interval.
SD = np.sqrt(np.diag(k_s))
CI_low = Y_s - 1.96 * SD
CI_high = Y_s + 1.96 * SD
# Plotting.
plt.figure()
plt.grid()
plt.title(r"$\lambda$=%.1f, $\sigma_k$=%.1f" % ((2**0.5) * gpr.kernel_.k2.length_scale, gpr.kernel_.k1.constant_value**0.5))
plt.fill_between(X_s.ravel(), CI_high, CI_low, alpha=0.1, label='95% confidence interval')
plt.plot(X_s, Y_s, label="predict")
plt.xlabel(r"test input X_s, $k=\sigma_k^2 \exp(-\Vert X-X'\Vert_2^2 \, / \, \lambda^2)$")
plt.ylabel(r"test output, Y_s = sin X_s + $\epsilon$")
plt.scatter(X, Y, label="train", c="black", marker="o")
plt.legend()
Out[8]:
<matplotlib.legend.Legend at 0x7fee8ab92c10>
In [ ]:
## The two methods above estimate the optimal hyper-parameters and give
## almost the same results.
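As a check on the kernel correspondence used in Method 2 (an illustrative addition, not a cell from the original notebook; the test values are arbitrary), the two parameterizations can be compared entry by entry:

In [ ]:
# sigma_k^2 * exp(-||x - x'||^2 / lambda^2) should equal
# C(sigma_k^2) * RBF(length_scale = lambda / sqrt(2)).
ambda_chk, sigma_k_chk = 0.7, 0.4
sk = C(constant_value=sigma_k_chk**2) * RBF(length_scale=ambda_chk / np.sqrt(2))
print(np.allclose(kernel(X_s, X_s, ambda_chk, sigma_k_chk), sk(X_s)))  # expected: True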