% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/most_challenging.R
\name{most_challenging}
\alias{most_challenging}
\alias{hardest}
\title{Find the data points that were hardest to predict}
\usage{
most_challenging(
  data,
  type,
  obs_id_col = "Observation",
  target_col = "Target",
  prediction_cols = ifelse(type == "gaussian", "Prediction", "Predicted Class"),
  threshold = 0.15,
  threshold_is = "percentage",
  metric = NULL,
  cutoff = 0.5
)
}
\arguments{
\item{data}{\code{data.frame} with predictions, targets and observation IDs.
 Can be grouped by \code{\link[dplyr:group_by]{dplyr::group_by()}}.

 Predictions can be passed as values, predicted classes or predicted probabilities, depending on \code{`type`} (see the subsections below).

 \strong{N.B.} Adds \code{\link[base:.Machine]{.Machine$double.eps}} to all probabilities to avoid \code{log(0)}.

 \subsection{Multinomial}{
 When \code{`type`} is \code{"multinomial"}, the predictions can be passed in one of two formats.

 \subsection{Probabilities (Preferable)}{

 One column per class with the probability of that class.
 The columns should have the name of their class,
 as they are named in the target column. E.g.:

 \tabular{rrrr}{
  \strong{class_1} \tab \strong{class_2} \tab
  \strong{class_3} \tab \strong{target}\cr
  0.269 \tab 0.528 \tab 0.203 \tab class_2\cr
  0.368 \tab 0.322 \tab 0.310 \tab class_3\cr
  0.375 \tab 0.371 \tab 0.254 \tab class_2\cr
  ... \tab ... \tab ... \tab ...}
 }
 \subsection{Classes}{

 A single column of type \code{character} with the predicted classes. E.g.:

 \tabular{rr}{
  \strong{prediction} \tab \strong{target}\cr
  class_2 \tab class_2\cr
  class_1 \tab class_3\cr
  class_1 \tab class_2\cr
  ... \tab ...}

 }
 }
 \subsection{Binomial}{
 When \code{`type`} is \code{"binomial"}, the predictions can be passed in one of two formats.

 \subsection{Probabilities (Preferable)}{
 One column with the \strong{probability of the observation belonging to
 the second class alphabetically}
 ("dog" if classes are "cat" and "dog"). E.g.:

 \tabular{rr}{
  \strong{prediction} \tab \strong{target}\cr
  0.769 \tab "dog"\cr
  0.368 \tab "dog"\cr
  0.375 \tab "cat"\cr
  ... \tab ...}
 }

 Note: When the class labels are ordered alphabetically, they are treated as type \code{character},
 which is why e.g. \code{100} would come before \code{7}.

 \subsection{Classes}{

 A single column of type \code{character} with the predicted classes. E.g.:

 \tabular{rr}{
  \strong{prediction} \tab \strong{target}\cr
  class_0 \tab class_1\cr
  class_1 \tab class_1\cr
  class_1 \tab class_0\cr
  ... \tab ...}
 }
 }
 \subsection{Gaussian}{
 When \code{`type`} is \code{"gaussian"}, the predictions should be passed as
 one column with the predicted values. E.g.:

 \tabular{rr}{
  \strong{prediction} \tab \strong{target}\cr
  28.9 \tab 30.2\cr
  33.2 \tab 27.1\cr
  23.4 \tab 21.3\cr
  ... \tab ...}
 }}

\item{type}{Type of task used to get the predictions:

 \code{"gaussian"} for regression (like linear regression).

 \code{"binomial"} for binary classification.

 \code{"multinomial"} for multiclass classification.}

\item{obs_id_col}{Name of column with observation IDs. This will be used to aggregate
the performance of each observation.}

\item{target_col}{Name of column with the true classes/values in \code{`data`}.}

\item{prediction_cols}{Name(s) of column(s) with the predictions.}

\item{threshold}{Threshold to filter observations by. Depends on \code{`type`} and \code{`threshold_is`}.

 The \code{threshold} can either be a \strong{percentage} or a \strong{score}.
 For percentages, a lower \code{threshold}
 returns fewer observations. For scores, this depends on \code{`type`}.

 \subsection{Gaussian}{
 \subsection{threshold_is "percentage"}{
 (Approximate) percentage of the observations with the largest root mean square errors
 to return.
 }
 \subsection{threshold_is "score"}{
 Observations with a root mean square error larger than or equal to the \code{threshold} will be returned.
 }
 }
 \subsection{Binomial, Multinomial}{
 \subsection{threshold_is "percentage"}{
 (Approximate) percentage of the observations to return with:

 \code{MAE}, \code{Cross Entropy}: Highest error scores.

 \code{Accuracy}: Lowest accuracies.
 }
 \subsection{threshold_is "score"}{
 \code{MAE}, \code{Cross Entropy}: Observations with an error score above or equal to the threshold will be returned.

 \code{Accuracy}: Observations with an accuracy below or equal to the threshold will be returned.

 }
 }}

\item{threshold_is}{Either \code{"score"} or \code{"percentage"}. See \code{`threshold`}.}

\item{metric}{The metric to use. If \code{NULL},
 the default metric depends on the format of the prediction columns.

 \subsection{Binomial, Multinomial}{
 \code{"Accuracy"}, \code{"MAE"} or \code{"Cross Entropy"}.

 When \emph{one} prediction column with predicted \emph{classes} is passed,
 the default is \code{"Accuracy"}.
 In this configuration, the other metrics are not calculated.

 When \emph{one or more} prediction columns with predicted \emph{probabilities} are passed,
 the default is \code{"MAE"}. This is the Mean Absolute Error of the
 probability of the target class.
 }

 \subsection{Gaussian}{
 Ignored. Always uses \code{"RMSE"}.
 }}

\item{cutoff}{Threshold for predicted classes. (Numeric)

 N.B. \strong{Binomial only}.}
}
\value{
\code{data.frame} with the most challenging observations and their metrics.

 \code{`>=` / `<=`} denotes the threshold as a score.
}
\description{
\Sexpr[results=rd, stage=render]{lifecycle::badge("experimental")}
 Finds the data points that, overall, were the most challenging to predict,
 based on a prediction metric.
}
\examples{
\donttest{
# Attach packages
library(cvms)
library(dplyr)

##
## Multinomial
##

# Find the most challenging data points (per classifier)
# in the predicted.musicians dataset
# which resembles the "Predictions" tibble from the evaluation results

# Passing predicted probabilities
# Observations with 30\% highest MAE scores
most_challenging(
  predicted.musicians,
  obs_id_col = "ID",
  prediction_cols = c("A", "B", "C", "D"),
  type = "multinomial",
  threshold = 0.30
)

# Observations with 25\% highest Cross Entropy scores
most_challenging(
  predicted.musicians,
  obs_id_col = "ID",
  prediction_cols = c("A", "B", "C", "D"),
  type = "multinomial",
  threshold = 0.25,
  metric = "Cross Entropy"
)

# Passing predicted classes
# Observations with 30\% lowest Accuracy scores
most_challenging(
  predicted.musicians,
  obs_id_col = "ID",
  prediction_cols = "Predicted Class",
  type = "multinomial",
  threshold = 0.30
)

# The 40\% lowest-scoring on accuracy per classifier
predicted.musicians \%>\%
  dplyr::group_by(Classifier) \%>\%
  most_challenging(
    obs_id_col = "ID",
    prediction_cols = "Predicted Class",
    type = "multinomial",
    threshold = 0.40
  )

# Accuracy scores at or below 0.05
most_challenging(
  predicted.musicians,
  obs_id_col = "ID",
  type = "multinomial",
  threshold = 0.05,
  threshold_is = "score"
)

##
## Binomial
##

# Subset the predicted.musicians
binom_data <- predicted.musicians \%>\%
  dplyr::filter(Target \%in\% c("A","B")) \%>\%
  dplyr::rename(Prediction = B)

# Passing probabilities
# Observations with 30\% highest MAE
most_challenging(
  binom_data,
  obs_id_col = "ID",
  type = "binomial",
  prediction_cols = "Prediction",
  threshold = 0.30
)

# Observations with 30\% highest Cross Entropy
most_challenging(
  binom_data,
  obs_id_col = "ID",
  type = "binomial",
  prediction_cols = "Prediction",
  threshold = 0.30,
  metric = "Cross Entropy"
)

# Passing predicted classes
# Observations with 30\% lowest Accuracy scores
most_challenging(
  binom_data,
  obs_id_col = "ID",
  type = "binomial",
  prediction_cols = "Predicted Class",
  threshold = 0.30
)

##
## Gaussian
##

set.seed(1)

df <- data.frame(
  "Observation" = rep(1:10, times = 3),
  "Target" = rnorm(n = 30, mean = 25, sd = 5),
  "Prediction" = rnorm(n = 30, mean = 27, sd = 7)
)

# The 20\% highest RMSE scores
most_challenging(
  df,
  type = "gaussian",
  threshold = 0.2
)

# RMSE scores of 9 or above
most_challenging(
  df,
  type = "gaussian",
  threshold = 9,
  threshold_is = "score"
)
}
}
\author{
Ludvig Renbo Olsen, \email{r-pkgs@ludvigolsen.dk}
}
