Feature Importance/Selection
This file is best viewed in a Pluto notebook. To run it this way, from the MagNav.jl directory, do:
julia> using Pluto
julia> Pluto.run() # select & open notebook
This is a reactive notebook, so feel free to change any parameters of interest, such as adding/removing features for the model, switching between linear models (:TL, :TL_mod, :elasticnet, :plsr), or selecting different training & testing lines.
Import packages and DataFrames
The DataFrames listed below provide useful information about the flight data collected by Sander Geophysics Ltd. (SGL) & magnetic anomaly maps.
DataFrame | Description |
---|---|
df_map | map files relevant for SGL flights |
df_cal | SGL calibration flight lines |
df_flight | SGL flight files |
df_all | all flight lines |
df_nav | all navigation-capable flight lines |
df_event | pilot-recorded in-flight events |
begin
cd(@__DIR__)
# uncomment line below to use local MagNav.jl (downloaded folder)
# using Pkg; Pkg.activate("../"); Pkg.instantiate()
using MagNav
using CSV, DataFrames
using Plots: plot, plot!
using Random: seed!
using Statistics: mean, median, std
seed!(33); # for reproducibility
include("dataframes_setup.jl"); # setup DataFrames
end;
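With the setup complete, the DataFrames listed above are available for inspection. The block below is an optional sketch using standard DataFrames.jl calls (first & describe); it assumes only that dataframes_setup.jl has defined the DataFrames named in the table.
begin # optional: quick look at the setup DataFrames (sketch, not part of the original workflow)
first(df_flight,5) # first few SGL flight file entries
describe(df_nav)   # column summaries for the navigation-capable flight lines
end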
Select magnetometers & parameters for compensation.
begin # try modifying these parameters
features = [:mag_4_uc,:mag_4_uc_dot,:mag_4_uc_dot4,:TL_A_flux_a]
use_mag = :mag_4_uc
use_vec = :flux_d
terms = [:p3,:i3,:e3]
end;
comp_params_init = NNCompParams(features_setup = features,
model_type = :m1,
y_type = :d,
use_mag = use_mag,
use_vec = use_vec,
terms = terms,
epoch_adam = 100);
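NNCompParams also supports keyword-based copies of an existing parameter set (this pattern is used again in the sparse group Lasso section below), so a variant can be derived from comp_params_init without restating every field. The variant name & epoch count below are purely illustrative.
# illustrative only: derive a variant of comp_params_init with a longer training run
comp_params_longer = NNCompParams(comp_params_init, epoch_adam = 300);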
Select training & testing flight lines from Flight 1006 (see readme).
begin
lines_train = [1006.03, 1006.04, 1006.05, 1006.06]
lines_test = [1006.08]
end;
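To see which lines are available before choosing, df_all (or df_nav) can be filtered by flight; the :flight & :line column names and the :Flt1006 flight name used below are assumptions about the setup DataFrames.
begin # optional sketch: list candidate flight lines for Flight 1006 (column & flight names assumed)
df_1006 = df_all[df_all.flight .== :Flt1006,:] # keep only Flight 1006 rows
df_1006[:,[:flight,:line]]                     # flight & line numbers available for training/testing
end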
Perform neural network-based calibration using training data & extract trained neural network (NN) compensation model. The full list of SGL flights is in df_flight, the full list of maps is in df_map, & the full list of flight lines is in df_all.
begin
(comp_params,_,_,_,feats) =
comp_train(comp_params_init,lines_train,df_all,df_flight,df_map)
m = comp_params.model # extract trained NN model
end;
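The trained model can also be evaluated directly on the held-out testing lines. The call below is a sketch that assumes MagNav.jl's comp_test takes the same DataFrame arguments as comp_train (error statistics are reported during the call); see the MagNav.jl docs for the exact return values, which are not destructured here.
# optional sketch: evaluate the trained compensation model on the testing lines (assumed comp_test signature)
comp_test(comp_params,lines_test,df_nav,df_flight,df_map);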
Get training & testing data & normalize by feature (columns). Typically this is done internally, but shown here to better explain feature importance/selection. The full list of navigation-capable flight lines is in df_nav.
begin
(_,x_train,y_train,_,_,_) =
get_Axy(lines_train,df_all,df_flight,df_map,features;
use_mag=use_mag,use_vec=use_vec,terms=terms)
(_,x_test,y_test,_,_,_) =
get_Axy(lines_test ,df_nav,df_flight,df_map,features;
use_mag=use_mag,use_vec=use_vec,terms=terms)
(x_bias,x_scale,x_train_norm,x_test_norm) = norm_sets(x_train,x_test)
(y_bias,y_scale,y_train_norm,y_test_norm) = norm_sets(y_train,y_test)
end;
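The normalization above is the feature-wise shift & scale that comp_train otherwise applies internally. As a sketch of how the pieces fit together, the trained NN can be applied to the normalized test features and its prediction un-normalized; the features-as-rows transpose and the bias/scale convention below are assumptions about the :m1 model input layout and norm_sets.
begin # optional sketch: manual forward pass with the trained NN (input layout & normalization convention assumed)
y_hat_norm = vec(m(Float32.(x_test_norm'))) # NN prediction on normalized test features
y_hat = vec(y_hat_norm .* y_scale .+ y_bias) # undo the target normalization
println("test error std: ",round(std(y_test .- y_hat),digits=2)," nT")
end;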
Shapley-based feature importance
Determine & plot Shapley effects.
begin
N_shap = length(y_test_norm) # number of samples to use for explanation
range_shap = 1:12 # (ranked) features to plot
(df_shap,baseline_shap) = eval_shapley(m,Float32.(x_test_norm),feats,N_shap)
p1 = plot_shapley(df_shap,baseline_shap,range_shap)
end
Global sensitivity analysis (GSA)-based feature importance
List of most important features.
begin
using DataFrames: sort
N_gsa = length(y_test_norm) # number of samples to use for explanation
means = eval_gsa(m,Float32.(x_test_norm),N_gsa)
df_gsa = sort(DataFrame(feature=feats,means=means),:means,by=abs,rev=true)
end
 | feature | means |
---|---|---|
1 | :mag_4_uc | -1.07458 |
2 | :mag_4_uc_dot | 0.273345 |
3 | :TL_A_flux_a_4 | -0.179471 |
4 | :TL_A_flux_a_3 | -0.176216 |
5 | :TL_A_flux_a_1 | -0.144326 |
6 | :TL_A_flux_a_8 | 0.089117 |
7 | :TL_A_flux_a_6 | -0.0439842 |
8 | :mag_4_uc_dot4 | -0.0185527 |
9 | :TL_A_flux_a_5 | -0.0147732 |
10 | :TL_A_flux_a_9 | -0.013832 |
11 | :TL_A_flux_a_7 | -0.0065155 |
12 | :TL_A_flux_a_2 | 0.00532807 |
Sparse group Lasso (SGL)-based feature importance
List of most important features.
begin
α_sgl = 0.5
λ_sgl = 1e-5
comp_params_sgl_init = NNCompParams(comp_params_init,α_sgl=α_sgl,λ_sgl=λ_sgl)
comp_params_sgl =
comp_train(comp_params_sgl_init,lines_train,df_all,df_flight,df_map)[1]
m_sgl = comp_params_sgl.model # extract trained NN model
w_sgl = comp_params_sgl.data_norms[3]*sparse_group_lasso(m_sgl,1)
df_sgl = sort(DataFrame(feature=feats,w_norm=w_sgl),:w_norm,by=abs,rev=true)
# uncomment lines below to compare with the NN model trained earlier without SGL regularization
# m_sgl_ = comp_params.model # extract NN model trained without SGL
# w_sgl_ = comp_params.data_norms[3]*sparse_group_lasso(m_sgl_,1)
# df_sgl_ = sort(DataFrame(feature=feats,w_norm=w_sgl_),:w_norm,by=abs,rev=true)
end
 | feature | w_norm |
---|---|---|
1 | :mag_4_uc | 1.29958 |
2 | :TL_A_flux_a_1 | 0.956714 |
3 | :TL_A_flux_a_2 | 0.891231 |
4 | :TL_A_flux_a_7 | 0.735043 |
5 | :TL_A_flux_a_5 | 0.734035 |
6 | :TL_A_flux_a_3 | 0.64471 |
7 | :mag_4_uc_dot | 0.613843 |
8 | :TL_A_flux_a_4 | 0.610832 |
9 | :TL_A_flux_a_8 | 0.556718 |
10 | :TL_A_flux_a_6 | 0.344826 |
11 | :TL_A_flux_a_9 | 0.310025 |
12 | :mag_4_uc_dot4 | 0.197108 |
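Both tabulated rankings place :mag_4_uc first, but they order the Tolles-Lawson (TL_A) flux terms differently. One way to compare them side by side is to join the two tables on the feature name; innerjoin & sort below are standard DataFrames.jl calls and only use columns defined above.
begin # optional sketch: compare GSA-based & sparse group Lasso-based rankings side by side
using DataFrames: innerjoin
df_compare = innerjoin(df_gsa,df_sgl,on=:feature) # one row per feature with both scores
sort(df_compare,:w_norm,rev=true)                 # ranked by sparse group Lasso weight norm
end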