% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/model_preprocessing.R
\name{model_preprocess}
\alias{model_preprocess}
\title{Automate Data Preprocessing for Modeling}
\usage{
model_preprocess(
  df,
  y = "tag",
  ignore = NULL,
  train_test = NA,
  split = 0.7,
  weight = NULL,
  target = "auto",
  balance = FALSE,
  impute = FALSE,
  no_outliers = TRUE,
  unique_train = TRUE,
  center = FALSE,
  scale = FALSE,
  thresh = 10,
  seed = 0,
  quiet = FALSE
)
}
\arguments{
\item{df}{Dataframe. Dataframe containing all your data, including
the dependent variable labeled as \code{'tag'}. If you want to define
which variable should be used instead, use the \code{y} parameter.}

\item{y}{Character. Column name for the dependent variable.}

\item{ignore}{Character vector. Columns that the model should be forced to ignore.}

\item{train_test}{Character. If needed, name of the column in \code{df}
containing 'test' and 'train' values used to split the data.}

\item{split}{Numeric. Value between 0 and 1 to split as train/test
datasets. Value is for training set. Set value to 1 to train with all
available data and test with same data (cross-validation will still be
used when training). If \code{train_test} is set, value will be overwritten
with its real split rate.}

\item{weight}{Column with observation weights. Giving an observation a
weight of zero is equivalent to excluding it from the dataset; giving an
observation a relative weight of 2 is equivalent to repeating that
row twice. Negative weights are not allowed.}

\item{target}{Value. Which is your target positive value? If
set to \code{'auto'}, the target with largest \code{mean(score)} will be
selected. Change the value to overwrite. Only used for binary
classification models.}

\item{balance}{Boolean. Auto-balance train dataset with under-sampling?}

\item{impute}{Boolean. Fill \code{NA} values with MICE?}

\item{no_outliers}{Boolean/Numeric. Remove \code{y}'s outliers from the dataset?
Will remove those values that are farther than n standard deviations from
\code{y}'s mean (Z-score). Set to \code{TRUE} for default (3)
or numeric to set a different multiplier.}

\item{unique_train}{Boolean. Keep only unique row observations for training data?}

\item{center}{Boolean. Using the base function \code{scale}, do you wish
to center all numerical values?}

\item{scale}{Boolean. Using the base function \code{scale}, do you wish
to scale all numerical values?}

\item{thresh}{Integer. Threshold for selecting classification or regression
models: this number is the threshold of unique values found in \code{y}
(more than: regression; less than: classification).}

\item{seed}{Integer. Set a seed for reproducibility. AutoML can only
guarantee reproducibility if \code{max_models} is used because
\code{max_time} is resource limited.}

\item{quiet}{Boolean. Quiet all messages, warnings, recommendations?}
}
\value{
List. Contains the original data.frame \code{df}, an index
to identify which observations will be part of the train dataset
\code{train_index}, and which model type should be used \code{model_type}.
}
\description{
Pre-process your data before training a model. This is the prior step
on the \code{h2o_automl()} function's pipeline. It is exposed separately
to enable other use cases, such as when you want to use any other
framework, library, or custom algorithm.
}
\examples{
data(dft) # Titanic dataset

model_preprocess(dft, "Survived", balance = TRUE)

model_preprocess(dft, "Fare", split = 0.5, scale = TRUE)

model_preprocess(dft, "Pclass", ignore = c("Fare", "Cabin"))

model_preprocess(dft, "Pclass", quiet = TRUE)
}
\seealso{
Other Machine Learning: 
\code{\link{ROC}()},
\code{\link{conf_mat}()},
\code{\link{export_results}()},
\code{\link{gain_lift}()},
\code{\link{h2o_automl}()},
\code{\link{h2o_predict_API}()},
\code{\link{h2o_predict_MOJO}()},
\code{\link{h2o_predict_binary}()},
\code{\link{h2o_predict_model}()},
\code{\link{h2o_selectmodel}()},
\code{\link{impute}()},
\code{\link{iter_seeds}()},
\code{\link{lasso_vars}()},
\code{\link{model_metrics}()},
\code{\link{msplit}()}
}
\concept{Machine Learning}
