Load packages

using Pkg
Pkg.activate(".")
Pkg.Registry.add(RegistrySpec(url = "https://github.com/RoyCCWang/RWPublicJuliaRegistry")) # where MaternRegression.jl is registered.
let
    pkgs = ["StaticArrays", "MaternRegression", "CSV", "DataFrames", "PythonPlot",
    ]
    for pkg in pkgs
        #check if package is in current environment.
        if Base.find_package(pkg) === nothing

            #install package.
            Pkg.add(pkg)
        end
    end
end

import Random
Random.seed!(25)

using LinearAlgebra
using Statistics
import MaternRegression as GS;

  Activating project at `~/Documents/repo/MaternRegression.jl/examples`
     Cloning registry from "https://github.com/RoyCCWang/RWPublicJuliaRegistry"
Registry `RWPublicJuliaRegistry` already exists in `~/.julia/registries/RWPublicJuliaRegistry`.

When we import Metaheuristics, the add-on module in MaternRegression for hyperparameter optimization also gets loaded.

import Metaheuristics as EVO;

for loading the data.

import CSV
import DataFrames as DF
import Dates;

Reset plot figures.

import PythonPlot as PLT
fig_num = 1;

Specify floating-point data type.

T = Float64;

Load data

Get all the data from the csv into a data frame.

function get_toronto_station()
    return "CA006158665"
end

function load_data(station_name)
    data_path = joinpath("data", "$(station_name).csv")
    return CSV.read(data_path, DF.DataFrame)
end

function reverse_standardization(yq::AbstractVector, m, s)
    return collect(s*yq[n] + m for n in eachindex(yq))
end

df_all = load_data(get_toronto_station());

Remove daily records that have missing maximum temperature.

df_tmax = filter(xx->!ismissing(xx.TMAX), df_all);

The temperature measurements needs to be divided by 10 to get Celcius units.

N = length(df_tmax.TMAX )
y0 = collect( convert(T, x/10) for x in df_tmax.TMAX );

y is the set of raining outputs.

mean_y = mean(y0)
std_y = std(y0)
y = (y0 .- mean_y) ./ std_y;

Convert dates to integers, and use as training inputs.

ts_dates = df_tmax.DATE
ts0 = collect( convert(T,  d |> Dates.value) for d in ts_dates );

Work with elapsed days as the independent variable. ts is the set of training inputs.

ts = ts0 .- minimum(ts0)
offset_date = ts_dates[begin];

Hyperparameter optimization

The optimization algorithm tries not to exceed this number of marginal likelihood evaluations:

f_calls_limit = 10_000;

GS.InferGain() constructs a dispatch data type that tells the algorithm that the variable ordering is: [λ, σ², b]

trait = GS.InferGain();

These are the optimization variable lower and upper bounds, respectively.

optim_lbs = convert(Vector{T}, [1e-5; 1e-5; 1e-5])
optim_ubs = convert(Vector{T}, [1.0; 1.0; 1.0]);

Solve the hyperparameter optimization problem. You need to pass in the module alias EVO so that MaternRegression can check if the pre-requisite dependencies for the hyperparameter optimization package extension can be loaded.

evo_sol_vars, evo_sol_cost = GS.hp_optim(
    GS.UseMetaheuristics(EVO),
    trait,
    ts, y, optim_lbs, optim_ubs;
    f_calls_limit = f_calls_limit,
    #p0s = initial_guesses, #include initial guesses here. These should be of type `Vector{Vector{T}}`.
);

To load results from disk, uncomment the following:

##Save to disk, for later use with the QMD file.
#using Serialization
#serialize("results/optim", (evo_sol_vars, evo_sol_cost));

To load results from disk, uncomment the following:

##Load from disk, make sure there were no errors in loading.
#evo_sol_vars, evo_sol_cost = deserialize("results/optim");

Query and visualize

Choose 5000 uniformly spaced time stamps across the in-fill interval, then pick the first window_len of them as the query positions.

Nq = 5000
window_len = 50
tqs = LinRange(minimum(ts), maximum(ts), Nq)[1:window_len];

Parse the hyperparameter optimization results to a Matern kernel, θ_sde_query, and the observation variance, σ².

θ_sde_query, σ² = GS.parse_ml_result(trait, evo_sol_vars);

Run GPR predictive inference to get predictive means mqs and predictive variances vqs.

sde_gp = GS.create_sdegp(θ_sde_query, ts, y, σ²) # cached phase.
mqs, vqs = GS.query_sdegp(T, sde_gp, tqs, ts); # query phase.

The preditive standard deviation.

sqs = sqrt.(vqs);

prepare for display.

inds = findall(xx->xx<tqs[window_len], ts)
ts_display = ts[inds]
y_display = reverse_standardization(y[inds], mean_y, std_y)

mqs = reverse_standardization(mqs, mean_y, std_y)
sqs = sqs .* std_y;

Set the shaded region to be 3 standard deviations from the mean.

plot_uq_amount = 3 .* sqs;

Visualize.

fig_size = (6, 4) # units are in inches.
dpi = 300

PLT.figure(fig_num; figsize = fig_size, dpi = dpi)
fig_num += 1

PLT.scatter(ts_display, y_display, s = 20, label = "Data")
PLT.plot(tqs, mqs, label = "Predictive mean")

PLT.fill_between(
    tqs,
    mqs - plot_uq_amount,
    mqs + plot_uq_amount,
    alpha = 0.3,
    label = "3 standard deviations"
)

PLT.xlabel("Days elapsed since $offset_date", fontsize = 12)
PLT.ylabel("Temperature (Celcius)", fontsize = 12)
PLT.legend()
PLT.show() #Not needed if run in the Julia REPL.
PLT.gcf() #Required for Literate.jl

This page was generated using Literate.jl.