% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/cc_outl.R
\name{cc_outl}
\alias{cc_outl}
\title{Identify Geographic Outliers in Species Distributions}
\usage{
cc_outl(x, lon = "decimallongitude", lat = "decimallatitude",
  species = "species", method = "quantile", mltpl = 5, tdi = 1000,
  value = "clean", sampling_thresh = 0, verbose = TRUE,
  min_occs = 7, thinning = FALSE, thinning_res = 0.5)
}
\arguments{
\item{x}{data.frame. Containing geographical coordinates and species
names.}

\item{lon}{character string. The column with the longitude coordinates.
Default = \dQuote{decimallongitude}.}

\item{lat}{character string. The column with the latitude coordinates.
Default = \dQuote{decimallatitude}.}

\item{species}{character string. The column with the species name. Default
= \dQuote{species}.}

\item{method}{character string.  Defining the method for outlier
selection.  See details. One of \dQuote{distance}, \dQuote{quantile},
\dQuote{mad}.  Default = \dQuote{quantile}.}

\item{mltpl}{numeric. The multiplier of the interquartile range
(\code{method == 'quantile'}) or median absolute deviation (\code{method ==
'mad'}) to identify outliers. See details.  Default = 5.}

\item{tdi}{numeric.  The minimum absolute distance (\code{method ==
'distance'}) of a record to all other records of a species to be identified
as outlier, in km. See details. Default = 1000.}

\item{value}{character string.  Defining the output value. See value.}

\item{sampling_thresh}{numeric. Cut off threshold for the sampling correction.
Indicates the quantile of sampling in which outliers should be ignored. For instance,
if \code{sampling_thresh = 0.25}, records in the 25\% worst sampled countries will
not be flagged as outliers. Default = 0 (no sampling correction).}

\item{verbose}{logical. If TRUE reports the name of the test and the number
of records flagged.}

\item{min_occs}{numeric. Minimum number of geographically unique datapoints needed for a species to be tested.
This is necessary for reliable outlier estimation.
Species with fewer than \code{min_occs} records will not be tested and the output value will be 'TRUE'.
Default = 7. If \code{method == 'distance'}, consider a lower threshold.}

\item{thinning}{logical. If TRUE, forces a raster approximation for the distance calculation.
This is routinely used for species with more than 10,000 records for computational reasons,
but can be enforced for smaller datasets, which is recommended when sampling is very uneven. Default = FALSE.}

\item{thinning_res}{The resolution for the spatial thinning in decimal degrees. Default = 0.5.}
}
\value{
Depending on the \sQuote{value} argument, either a \code{data.frame}
containing the records considered correct by the test (\dQuote{clean}) or a
logical vector (\dQuote{flagged}), with TRUE = test passed and FALSE = test failed/potentially
problematic. Default = \dQuote{clean}.
}
\description{
Removes or flags records that are outliers in geographic space according to the method
defined via the \code{method} argument. Geographic outliers often represent
erroneous coordinates, for example due to data entry errors, imprecise
geo-references, or individuals in horticulture/captivity.
}
\details{
The method for outlier identification depends on the \code{method} argument.
If \dQuote{quantile}: a boxplot method is used and records are flagged as
outliers if their \emph{mean} distance to all other records of the same
species is larger than mltpl * the interquartile range of the mean distance
of all records of this species. If \dQuote{mad}: the median absolute
deviation is used. In this case a record is flagged as outlier, if the
\emph{mean} distance to all other records of the same species is larger than
the median of the mean distance of all points plus/minus the mad of the mean
distances of all records of the species * mltpl. If \dQuote{distance}:
records are flagged as outliers, if the \emph{minimum} distance to the next
record of the species is > \code{tdi}. For species with records from > 10000
unique locations a random sample of 1000 records is used for
the distance matrix calculation. The test skips species with fewer than \code{min_occs}
geographically unique records.

The likelihood of occurrence records being erroneous outliers is linked to the sampling effort
in any given location. To account for this, the \code{sampling_thresh} option fetches
the number of occurrence records available
from www.gbif.org, per country as a proxy of sampling effort. The outlier test
(the mean distance) for each record is then weighted by the log transformed
number of records per square kilometre in this country.
See \url{https://ropensci.github.io/CoordinateCleaner/articles/Tutorial_geographic_outliers.html}
for an example and further explanation of the outlier test.
}
\note{
See \url{https://ropensci.github.io/CoordinateCleaner/} for more
details and tutorials.
}
\examples{

x <- data.frame(species = letters[1:10], 
                decimallongitude = runif(100, -180, 180), 
                decimallatitude = runif(100, -90,90))
                
cc_outl(x)
cc_outl(x, method = "quantile", value = "flagged")
cc_outl(x, method = "distance", value = "flagged", tdi = 10000)
cc_outl(x, method = "distance", value = "flagged", tdi = 1000)

}
\seealso{
Other Coordinates: \code{\link{cc_cap}},
  \code{\link{cc_cen}}, \code{\link{cc_coun}},
  \code{\link{cc_dupl}}, \code{\link{cc_equ}},
  \code{\link{cc_gbif}}, \code{\link{cc_inst}},
  \code{\link{cc_iucn}}, \code{\link{cc_sea}},
  \code{\link{cc_urb}}, \code{\link{cc_val}},
  \code{\link{cc_zero}}
}
\concept{Coordinates}
\keyword{Coordinate}
\keyword{cleaning}
