% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/nlp_phrase_sequences.R
\name{keywords_phrases}
\alias{keywords_phrases}
\alias{phrases}
\title{Extract phrases - a sequence of terms which follow each other based on a sequence of Parts of Speech tags}
\usage{
keywords_phrases(x, term = x, pattern, is_regex = FALSE, sep = " ",
  ngram_max = 8, detailed = TRUE)

phrases(x, term = x, pattern, is_regex = FALSE, sep = " ",
  ngram_max = 8, detailed = TRUE)
}
\arguments{
\item{x}{a character vector of Parts of Speech tags where we want to locate a relevant sequence of POS tags as defined in \code{pattern}}

\item{term}{a character vector of the same length as \code{x} with the words or terms corresponding to the tags in \code{x}}

\item{pattern}{In case \code{is_regex} is set to FALSE, \code{pattern} should be a character vector with a sequence of POS tags 
to identify in \code{x}. The length of the character vector should be bigger than 1.\cr
In case \code{is_regex} is set to TRUE, this should be a regular expression which will be used on a concatenated version 
of \code{x} to identify the locations where this regular expression occurs. See the examples below.}

\item{is_regex}{logical indicating if \code{pattern} can be considered as a regular expression or if it is just
a character vector of POS tags. Defaults to FALSE, indicating \code{pattern} is not a regular expression.}

\item{sep}{character indicating how to collapse the phrase of terms which are found. Defaults to using a space.}

\item{ngram_max}{an integer indicating the maximum number of consecutive terms a phrase is allowed to consist of. Only 
used if \code{is_regex} is set to TRUE. Defaults to 8.}

\item{detailed}{logical indicating to return the exact positions where the phrase was found (set to \code{TRUE}) or just how many times each phrase is occurring (set to \code{FALSE}). 
Defaults to \code{TRUE}.}
}
\value{
If argument \code{detailed} is set to \code{TRUE} a data.frame with columns 
\itemize{
\item keyword: the phrase which corresponds to the collapsed terms of where the pattern was found
\item ngram: the length of the phrase
\item pattern: the pattern which was found
\item start: the starting index of \code{x} where the pattern was found
\item end: the ending index of \code{x} where the pattern was found
}
If argument \code{detailed} is set to \code{FALSE}, aggregate frequency statistics are returned in a data.frame containing the columns keyword, 
ngram and freq (how many times the phrase occurs).
}
\description{
This function allows you to extract phrases, like simple noun phrases, complex noun phrases
or any exact sequence of parts of speech tag patterns.\cr
An example use case of this is to get all text where an adjective is followed by a noun or
for example to get all phrases consisting of a preposition which is followed by a noun which is next followed by a verb.
More complex patterns are shown in the details below.
}
\details{
Common phrases which you might be interested in and which can be supplied to \code{pattern} are
\itemize{
\item Simple noun phrase: "(A|N)*N(P+D*(A|N)*N)*"
\item Simple verb phrase: "((A|N)*N(P+D*(A|N)*N)*P*(M|V)*V(M|V)*|(M|V)*V(M|V)*D*(A|N)*N(P+D*(A|N)*N)*|(M|V)*V(M|V)*(P+D*(A|N)*N)+|(A|N)*N(P+D*(A|N)*N)*P*((M|V)*V(M|V)*D*(A|N)*N(P+D*(A|N)*N)*|(M|V)*V(M|V)*(P+D*(A|N)*N)+))"
\item Noun phrase with coordination conjunction: "((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)"
\item Verb phrase with coordination conjunction: "(((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)(P(CP)*)*(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*(D(CD)*)*((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)+|((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)(P(CP)*)*((M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*(D(CD)*)*((A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*(C(D(CD)*)*(A(CA)*|N)*N((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)*)*)|(M(CM)*|V)*V(M(CM)*|V)*(C(M(CM)*|V)*V(M(CM)*|V)*)*((P(CP)*)+(D(CD)*)*(A(CA)*|N)*N)+))"
}
See the examples.\cr
Note that this functionality is also implemented in the phrasemachine package, where it is implemented using plain R code, 
while the implementation in this package uses a faster Rcpp implementation for 
extracting these kinds of regular-expression-like phrases.
}
\examples{
data(brussels_reviews_anno, package = "udpipe")
x <- subset(brussels_reviews_anno, language \%in\% "fr")

## Find exactly this sequence of POS tags
np <- keywords_phrases(x$xpos, pattern = c("DT", "NN", "VB", "RB", "JJ"), sep = "-")
head(np)
np <- keywords_phrases(x$xpos, pattern = c("DT", "NN", "VB", "RB", "JJ"), term = x$token)
head(np)

## Find noun phrases with the following regular expression: (A|N)+N(P+D*(A|N)*N)*
x$phrase_tag <- as_phrasemachine(x$xpos, type = "penn-treebank")
nounphrases <- keywords_phrases(x$phrase_tag, term = x$token, 
                                pattern = "(A|N)+N(P+D*(A|N)*N)*", is_regex = TRUE, 
                                ngram_max = 4, 
                                detailed = TRUE)
head(nounphrases, 10)
head(sort(table(nounphrases$keyword), decreasing=TRUE), 20)

## Find frequent sequences of POS tags
library(data.table)
x <- as.data.table(x)
x <- x[, pos_sequence := txt_nextgram(x = xpos, n = 3), by = list(doc_id, sentence_id)]
tail(sort(table(x$pos_sequence)))
np <- keywords_phrases(x$xpos, term = x$token, pattern = c("IN", "DT", "NN"))
head(np)
}
\seealso{
\code{\link{as_phrasemachine}}
}
