#' Read a CSV file exported from Qualtrics
#'
#' Reads comma separated CSV files generated by Qualtrics
#' software. The second line containing the variable labels is imported.
#' Repetitive introductions to matrix questions are automatically removed.
#' Variable labels are stored as attributes.
#'
#' @param file_name String. A CSV data file.
#' @param import_id Logical. If \code{TRUE}, use Qualtrics import IDs instead of
#' question IDs as column names. Defaults to \code{FALSE}.
#' @param strip_html Logical. If \code{TRUE}, then remove HTML tags. Defaults
#' to \code{TRUE}.
#' @param time_zone String. A local timezone to determine response date
#' values. Defaults to \code{NULL} which corresponds to UTC time. See
#' \url{https://api.qualtrics.com/docs/time-zones} for more information on
#' format.
#' @param legacy Logical. If \code{TRUE}, then import "legacy" format CSV files
#' (as of 2017). Defaults to \code{FALSE}.
#' @param col_types Optional. This argument provides a way to manually overwrite
#' column types that may be incorrectly guessed. Takes a \code{\link[readr]{cols}}
#' specification. See example below and \code{\link[readr]{cols}} for formatting
#' details. Defaults to \code{NULL}.
#'
#' @importFrom sjlabelled set_label
#' @importFrom jsonlite fromJSON
#' @importFrom stringr str_match
#' @importFrom readr read_csv
#' @return A data frame. Variable labels are stored as attributes. They are not
#' printed on the console but are visible in the RStudio viewer.
#' @export
#' @examples
#' \dontrun{
#' # Generic use of read_survey()
#' df <- read_survey("<YOUR-PATH-TO-CSV-FILE>")
#' }
#' # Example using current data format
#' file <- system.file("extdata", "sample.csv", package = "qualtRics")
#' df <- read_survey(file)
#'
#' # Example using legacy data format
#' file <- system.file("extdata", "sample_legacy.csv", package = "qualtRics")
#' df <- read_survey(file, legacy = TRUE)
#'
#' # Example changing column type
#' file <- system.file("extdata", "sample.csv", package = "qualtRics")
#' # Force EndDate to be a string
#' df <- read_survey(file, col_types = readr::cols(EndDate = readr::col_character()))
#'
read_survey <- function(file_name,
                        strip_html = TRUE,
                        import_id = FALSE,
                        time_zone = NULL,
                        legacy = FALSE,
                        col_types = NULL) {

  # START UP: CHECK ARGUMENTS PASSED BY USER ----

  # Both flags are scalars, so use the short-circuiting `&&` rather than the
  # elementwise `&`.
  if (import_id && legacy) {
    stop("Import IDs as column names are not supported for legacy CSVs.\nSet import_id = FALSE.",
         call. = FALSE)
  }

  # check if file exists
  assert_surveyFile_exists(file_name)

  # Number of leading rows that precede the responses:
  # 2 in the legacy format, 3 otherwise.
  skip_rows <- if (legacy) 2 else 3

  # Set time_zone to UTC if left unspecified
  if (is.null(time_zone)) {
    time_zone <- "UTC"
  }

  # Everything is read as character first; types are inferred at the very end
  # by `readr::type_convert()`.
  all_character <- readr::cols(.default = readr::col_character())

  # READ DATA ----

  # Responses only (all leading header rows are skipped)
  rawdata <- suppressMessages(readr::read_csv(
    file = file_name,
    col_names = FALSE,
    col_types = all_character,
    skip = skip_rows,
    na = c("")
  ))

  # Header: row 1 becomes the column names, row 2 (the variable labels) becomes
  # the single data row.
  header <- suppressWarnings(suppressMessages(readr::read_csv(
    file = file_name,
    col_names = TRUE,
    col_types = all_character,
    n_max = 1
  )))

  # Message for no data in survey: return an empty, all-character tibble that
  # carries the column names found in the header.
  if (nrow(rawdata) < 1) {
    message("The survey you are importing has no responses.")
    header_names <- names(header)
    tbl <- tibble::as_tibble(
      matrix(character(), nrow = 0, ncol = length(header_names)),
      .name_repair = "minimal"
    )
    colnames(tbl) <- header_names
    return(tbl)
  }

  # MANIPULATE DATA ----

  # If Qualtrics adds an empty column at the end (first line ends with a
  # comma), remove it. This has to happen before the import IDs are parsed:
  # the empty trailing cell would otherwise end up as a bare `NA` inside the
  # JSON string built below.
  if (isTRUE(grepl(",$", readLines(file_name, n = 1)))) {
    header <- header[-ncol(header)]
    rawdata <- rawdata[-ncol(rawdata)]
  }

  # Add names
  names(rawdata) <- names(header)

  if (import_id) {
    # The row directly above the first response holds one JSON object per
    # column; its `ImportId` field is used as the column name.
    new_ids <- suppressMessages(readr::read_csv(
      file = file_name,
      col_names = FALSE,
      col_types = all_character,
      skip = skip_rows - 1,
      n_max = 1
    ))

    # Keep one entry per (already trimmed) data column
    id_cells <- as.character(unlist(new_ids))[seq_len(ncol(rawdata))]

    names(rawdata) <- jsonlite::fromJSON(
      paste0("[", paste(id_cells, collapse = ","), "]")
    )$ImportId
  }

  # Variable labels: the single data row of `header`
  secondrow <- unlist(header)

  # Clean variable labels
  if (strip_html) {
    secondrow <- remove_html(secondrow)
  }

  # Scale question with subquestion:
  # if the label contains a punctuation character, optional whitespace and
  # then "-", keep only the text after that "-". The leading `.*` is greedy,
  # so the last such occurrence in the label wins.
  subquestions <- stringr::str_match(secondrow, ".*[:punct:]\\s*-(.*)")[, 2]

  # Else if subquestion returns NA, use whole string
  no_match <- is.na(subquestions)
  subquestions[no_match] <- unlist(secondrow[no_match])

  # Remaining NAs default to 'empty string'
  subquestions[is.na(subquestions)] <- ""

  # Infer column types; `col_types` lets the caller override the guesses
  rawdata <- readr::type_convert(rawdata,
                                 locale = readr::locale(tz = time_zone),
                                 col_types = col_types)

  # RETURN ----

  # Attach the cleaned labels as variable-label attributes
  sjlabelled::set_label(rawdata, subquestions)
}
