#' Perform DBSCAN Clustering on a Phylogenetic Tree
#'
#' This function applies the DBSCAN clustering algorithm on a set of protein sequences
#' to identify clusters and remove outliers based on a distance cutoff.
#'
#' @importFrom stats as.dist
#' @param input_obj An `AAStringSet` object containing protein sequences, either unaligned or already aligned (with `-` as the gap character).
#' @param cutoff A numeric value specifying the distance cutoff for clustering.
#' @param nmin An integer specifying the minimum number of points required to form a cluster (DBSCAN parameter).
#' @return An `AAStringSet` object containing the input protein sequences with outliers removed.
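#' @details The function uses the DBSCAN algorithm to cluster sequences based on their pairwise (phylogenetic) distances,
#' computed with `DECIPHER::DistanceMatrix()`. If no sequence contains a gap character (`-`), the sequences are first
#' aligned with `DECIPHER::AlignSeqs()`; otherwise they are treated as already aligned and must all have the same length.
#' Sequences identified as outliers (not assigned to any cluster) are excluded from the final output.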
#' @examples
#' # Example usage:
#' library(Biostrings)
#'
#' # Create an AAStringSet object with the sequences
#' seqs <- AAStringSet(c(
#'   seq1 = "MKTIIALSYIFCLVFADYKDDDDK",
#'   seq2 = "MKTIIALSYIFCLVFADYKDLLKDDDD",
#'   seq3 = "MKTIIALSYIFCLVFADEELYKDDDD",
#'   seq4 = "MKTIEIALSYIFCLVFADYKDDDD",
#'   seq5 = "MKTIIKLAAASYIFCLVFADYKDDDD",
#'   seq6 = "MKTIIALSKIPFCLVFADYKDDDD",
#'   seq7 = "MKTIIALSYIFIQEERTCLVFADYKDDDD"
#' ))
#'
#' # Perform DBSCAN clustering and remove outliers
#' no_outliers <- phyl_tree_cluster_dbscan(seqs, cutoff = 0.5, nmin = 5)
#' @export

phyl_tree_cluster_dbscan <- function(input_obj, cutoff, nmin) {
  # Overview: validate the sequences, align them when they carry no gap
  # characters, compute a pairwise distance matrix, run DBSCAN on it and
  # return only the input sequences that were assigned to a cluster.
  #
  # Arguments:
  #   input_obj  sequence set to filter (documented as an AAStringSet).
  #   cutoff     passed to dbscan::dbscan() as `eps`.
  #   nmin       passed to dbscan::dbscan() as `minPts`.
  #
  # Errors (via stop()): empty input, sequences with characters outside the
  # allowed alphabet, fewer than two sequences, or an alignment whose
  # sequences differ in length.
  seqs <- input_obj

  if (length(seqs) == 0) {
    stop("No sequences were found in the input file. Please check the file format and content.")
  }

  # Convert once and validate every sequence in a single vectorised call.
  # Only the 20 standard amino acid letters and the gap character pass.
  seq_strings <- as.character(seqs)
  valid_amino_acids <- "^[ACDEFGHIKLMNPQRSTVWY-]+$"
  if (!all(grepl(valid_amino_acids, seq_strings))) {
    stop("The input file contains invalid sequences. Please ensure all sequences are valid amino acid sequences (including gaps).")
  }

  # A distance matrix needs at least two sequences. This is checked before
  # aligning so the caller gets this message instead of an error raised from
  # inside the alignment step.
  if (length(seqs) < 2) {
    stop("The alignment must contain at least 2 sequences to calculate a distance matrix.")
  }

  # A gap in any sequence is taken to mean the input is already aligned;
  # otherwise the sequences are aligned here.
  if (any(grepl("-", seq_strings, fixed = TRUE))) {
    alignment <- seqs
  } else {
    alignment <- DECIPHER::AlignSeqs(seqs)
  }

  # An alignment is only usable if all rows have the same width.
  if (length(unique(Biostrings::width(alignment))) > 1) {
    stop("Not all sequences in the alignment are the same length.")
  }

  D <- DECIPHER::DistanceMatrix(alignment)
  idx <- dbscan::dbscan(stats::as.dist(D), eps = cutoff, minPts = nmin)

  # Keep only sequences with a positive cluster id (the rest are outliers).
  # The original (unaligned) input sequences are returned, not the alignment.
  seqs[idx$cluster > 0]
}

