This vignette demonstrates how to get started with the
leaf R package, a high-level interface to the LEAF Symbolic
Regression framework.
Note: Before using
leaf, you need to install the Python backend by running `leaf::install_leaf()`.
# Load the example datasets bundled with the leaf package.
train_data <- leaf_data("eg_train")
test_data <- leaf_data("eg_test")
# Inspect the first rows: x1 and x2 are the inputs, y is the target
# (X is just a row identifier carried over from the CSV export).
print(head(train_data))
#> X x1 x2 y
#> 1 12 37.45222 1.51908854 114.625592
#> 2 4 26.02808 0.07787298 4.804276
#> 3 37 9.09870 0.17996103 3.780866
#> 4 8 26.62270 0.52217593 28.528340
#> 5 3 33.38746 1.56938555 105.608897
#> 6 6 29.72694 1.35455366 81.320448
print(head(test_data))
#> X x1 x2 y
#> 1 13 10.961724 1.1329768 25.40347
#> 2 39 24.859367 0.6104367 31.09740
#> 3 30 29.766229 1.1632080 70.04424
#> 4 45 38.345487 1.4681886 113.42693
#> 5 17 5.582007 1.6563170 18.89703
#> 6 48 38.867698 1.4879493 116.50896
The formula specifies the transformed inputs and grouping structure.
Functional transformations are allowed inside the formula.
# Run the symbolic equation search on the training data.
# NOTE(review): `regressor` and `formula` are assumed to be created in an
# earlier chunk of this vignette (not shown here) — confirm against the
# full document.
search_results <- regressor$search_equations(
data = train_data,
formula = formula
)
#> 1. Processing data for equation search based on formula...
#> 2. Running engine 'rsrm' over 1 folds using up to 1 processes...
#> -- FINAL RESULTS --
#> Episode: 4/4
#> time: 60.08s
#> loss: 0.00014448195748697755
#> form: X2/X1+1-X1+F
#> HOF:
#> equation complexity loss
#> 0 0 0 999999999999999967336168804116691273849533185806555472917961779471295845921727862608739868455469056.00
#> 1 44.5014 1 1292.93
#> 2 33.4330*X2 2 450.49
#> 3 17.8258*X1*X2 3 0.15
#> 4 17.6940*X1*X2 + 0.5466 4 0.03
#> 5 X1*(17.6544*X2 + 0.3209) 5 0.02
#> 6 17.6484*X1*X2 + 0.2050*X1 + 0.2815 6 0.01
#> 7 17.6599*X1*X2 + 0.7257 - 0.0709/X1 7 0.01
#> 8 X1*(17.6404*X2*(X1 + 0.7876) + 1.0622)/(X1 + 0.7876) 8 0.00
#> 9 17.6249*X1*X2 + 0.1456*X1 + 0.0418*X2 + 0.4554 - 0.0461/X1 11 0.00
#> 10 -0.0423*X1**2 + 17.6418*X1*X2 + 0.2866*X1 + 0.3691 - 0.0330/X1 12 0.00
#> 11 17.6423*X1*X2 + 0.0970*X1 + 0.5986 - 0.1043/X1 + 0.0048/X1**2 13 0.00
#> ---
#>
#> task:dataset_923a85b0-5900-4aa2-a712-ab798821ae56 expr:17.642277203891876*X1*X2 + 0.09700466267074608*X1 + 0.5985664260163593 + -0.10427594286280444/X1 + 0.004757006682361494/X1**2 Loss_MSE:0.00 Test 0/1.
#> final result:
#> success rate : 0%
#> average discovery time is 60.086 seconds
#> Number of equations looked at (per test) [Total, Timed out, Successful]: [[3955, 0, 3955]]
#> 3. Found 12 raw skeletons. Deduplicating...
# Deduplicated skeletons: fitted numeric constants from the raw engine
# output are replaced by symbolic parameters (β1, β2, ...), one row per
# unique equation structure.
head(search_results)
#> Equation Complexity
#> 0 β1 1
#> 1 β1⋅x2 2
#> 2 β1⋅x1⋅x2 3
#> 3 x1⋅(β1⋅x2 + β2) 4
#> 4 β1⋅x1⋅x2 + β2 4
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6
fit_results <- regressor$fit(
data = train_data,
# do_cv = FALSE skips cross-validation; parameters are fitted on the
# full training set in one pass.
do_cv = FALSE
)
#> Fitting parameters for 11 equations...
#> Parameter fitting complete.
# The fitted table now carries a Loss column (training loss per equation).
head(fit_results)
#> Equation Complexity Loss
#> 0 β1 1 1.292927e+03
#> 1 β1⋅x2 2 4.504898e+02
#> 2 β1⋅x1⋅x2 3 1.513596e-01
#> 3 x1⋅(β1⋅x2 + β2) 4 2.168116e-02
#> 4 β1⋅x1⋅x2 + β2 4 3.160112e-02
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 6.814638e-03
eval_table <- regressor$evaluate(
metrics = c("RMSE", "R2"),
# Evaluate on held-out data so the metrics measure generalization,
# not training fit.
data = test_data
)
# One column is appended per requested metric, computed on `data`.
head(eval_table)
#> Equation Complexity Loss RMSE
#> 0 β1 1 1.292927e+03 38.53714791
#> 1 β1⋅x2 2 4.504898e+02 29.85585277
#> 2 β1⋅x1⋅x2 3 1.513596e-01 0.37579394
#> 3 x1⋅(β1⋅x2 + β2) 4 2.168116e-02 0.20061360
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6 6.814638e-03 0.08439777
#> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7 6.803462e-03 0.06765065
#> R2
#> 0 -0.01053178
#> 1 0.39347348
#> 2 0.99990391
#> 3 0.99997262
#> 5 0.99999515
#> 6 0.99999689
print(regressor$get_pareto_front())
#> Equation Complexity
#> 0 β1 1
#> 1 β1⋅x2 2
#> 2 β1⋅x1⋅x2 3
#> 3 x1⋅(β1⋅x2 + β2) 4
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6
#> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7
#> 7 x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1 9
#> 8 β1⋅x1⋅x2 + β2⋅x1 + β3⋅x2 + β4 + -1⋅β5⋅x1^-1 11
#> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1 12
#> 10 β1⋅x1⋅x2 + β2⋅x1 + β3 + -1⋅β4⋅x1^-1 + β5⋅x1^2^-1 13
#> Loss RMSE R2
#> 0 1.292927e+03 38.53714791 -0.01053178
#> 1 4.504898e+02 29.85585277 0.39347348
#> 2 1.513596e-01 0.37579394 0.99990391
#> 3 2.168116e-02 0.20061360 0.99997262
#> 5 6.814638e-03 0.08439777 0.99999515
#> 6 6.803462e-03 0.06765065 0.99999689
#> 7 4.223240e-04 0.01968863 0.99999974
#> 8 3.611868e-04 0.03950045 0.99999894
#> 9 1.582007e-04 0.01388075 0.99999987
#> 10 1.444818e-04 0.01629805 0.99999982results <- regressor$get_results()
plot_pareto(results, 'Complexity', 'RMSE', y_bigger_better = FALSE)
#> $plot
#>
#> $pareto_points
#> Equation Complexity
#> 0 β1 1
#> 1 β1⋅x2 2
#> 2 β1⋅x1⋅x2 3
#> 4 β1⋅x1⋅x2 + β2 4
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6
#> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7
#> 7 x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1 9
#> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1 12
#> Loss RMSE R2 .pareto
#> 0 1.292927e+03 38.53714791 -0.01053178 TRUE
#> 1 4.504898e+02 29.85585277 0.39347348 TRUE
#> 2 1.513596e-01 0.37579394 0.99990391 TRUE
#> 4 3.160112e-02 0.09294067 0.99999412 TRUE
#> 5 6.814638e-03 0.08439777 0.99999515 TRUE
#> 6 6.803462e-03 0.06765065 0.99999689 TRUE
#> 7 4.223240e-04 0.01968863 0.99999974 TRUE
#> 9 1.582007e-04 0.01388075 0.99999987 TRUE
#>
#> $all
#> Equation Complexity
#> 0 β1 1
#> 1 β1⋅x2 2
#> 2 β1⋅x1⋅x2 3
#> 3 x1⋅(β1⋅x2 + β2) 4
#> 4 β1⋅x1⋅x2 + β2 4
#> 5 β1⋅x1⋅x2 + β2⋅x1 + β3 6
#> 6 β1⋅x1⋅x2 + β2 + -1⋅β3⋅x1^-1 7
#> 7 x1⋅(β1⋅x2⋅(β2 + x1) + β3)⋅(β4 + x1)^-1 9
#> 8 β1⋅x1⋅x2 + β2⋅x1 + β3⋅x2 + β4 + -1⋅β5⋅x1^-1 11
#> 9 -1⋅β1⋅x1^2 + β2⋅x1⋅x2 + β3⋅x1 + β4 + -1⋅β5⋅x1^-1 12
#> 10 β1⋅x1⋅x2 + β2⋅x1 + β3 + -1⋅β4⋅x1^-1 + β5⋅x1^2^-1 13
#> Loss RMSE R2 .pareto
#> 0 1.292927e+03 38.53714791 -0.01053178 TRUE
#> 1 4.504898e+02 29.85585277 0.39347348 TRUE
#> 2 1.513596e-01 0.37579394 0.99990391 TRUE
#> 3 2.168116e-02 0.20061360 0.99997262 FALSE
#> 4 3.160112e-02 0.09294067 0.99999412 TRUE
#> 5 6.814638e-03 0.08439777 0.99999515 TRUE
#> 6 6.803462e-03 0.06765065 0.99999689 TRUE
#> 7 4.223240e-04 0.01968863 0.99999974 TRUE
#> 8 3.611868e-04 0.03950045 0.99999894 FALSE
#> 9 1.582007e-04 0.01388075 0.99999987 TRUE
#> 10 1.444818e-04 0.01629805 0.99999982 FALSE
Note on Task Types: The find_optimal_threshold() method is specifically designed for binary classification tasks. If your target variable is continuous (Regression), you should use the raw predictions directly. If your target is binary (Classification), this step helps transform continuous scores into class labels (0 or 1).
# Save current analysis. The ".pkl" extension reflects that the Python
# backend serializes the SymbolicRegressor via pickle.
path <- tempfile(fileext = ".pkl")
regressor$save(path)
#> SymbolicRegressor instance successfully saved to C:\Users\marti\AppData\Local\Temp\Rtmp4kIIBE\file6dcc18c269b1.pkl
# Load it later
regressor_loaded <- load_leaf_analysis(path)
#> SymbolicRegressor instance loaded successfully from C:\Users\marti\AppData\Local\Temp\Rtmp4kIIBE\file6dcc18c269b1.pkl
# Make predictions with loaded model.
# equation_id appears to use the 0-based ids shown in the tables above,
# so 3 selects x1*(β1*x2 + β2) — confirm against the package docs.
preds_loaded <- regressor_loaded$predict(
data = test_data,
equation_id = 3
)
head(preds_loaded)
#> [1] 25.12047 30.96778 70.01509 113.60330 18.63938 116.68777