% Generated by roxygen2: do not edit by hand
% Please edit documentation in R/textmodel.R
\name{textmodel_lss}
\alias{textmodel_lss}
\alias{textmodel_lss.dfm}
\alias{textmodel_lss.fcm}
\title{Fit a Latent Semantic Scaling model}
\usage{
textmodel_lss(x, ...)

\method{textmodel_lss}{dfm}(
  x,
  seeds,
  terms = NULL,
  k = 300,
  slice = NULL,
  weight = "count",
  cache = FALSE,
  simil_method = "cosine",
  engine = c("RSpectra", "irlba", "rsvd"),
  auto_weight = FALSE,
  include_data = FALSE,
  verbose = FALSE,
  ...
)

\method{textmodel_lss}{fcm}(
  x,
  seeds,
  terms = NULL,
  w = 50,
  max_count = 10,
  weight = "count",
  cache = FALSE,
  simil_method = "cosine",
  engine = c("rsparse"),
  auto_weight = FALSE,
  verbose = FALSE,
  ...
)
}
\arguments{
\item{x}{a dfm or fcm created by \code{\link[quanteda:dfm]{quanteda::dfm()}} or \code{\link[quanteda:fcm]{quanteda::fcm()}}}

\item{...}{additional arguments passed to the underlying engine.}

\item{seeds}{a character vector or named numeric vector that contains seed
words. If seed words contain "*", they are interpreted as glob patterns.
See \link[quanteda:valuetype]{quanteda::valuetype}.}

\item{terms}{a character vector or named numeric vector that specifies words
for which polarity scores will be computed; if a numeric vector, words' polarity
scores will be weighted accordingly; if \code{NULL}, all the features of
\code{\link[quanteda:dfm]{quanteda::dfm()}} or \code{\link[quanteda:fcm]{quanteda::fcm()}} will be used.}

\item{k}{the number of singular values requested from the SVD engine. Only used
when \code{x} is a \code{dfm}.}

\item{slice}{a number or indices of the components of word vectors used to
compute similarity; \code{slice < k} to further truncate word vectors; useful
for diagnosis and simulation.}

\item{weight}{weighting scheme passed to \code{\link[quanteda:dfm_weight]{quanteda::dfm_weight()}}. Ignored
when \code{engine} is "rsparse".}

\item{cache}{if \code{TRUE}, save result of SVD for next execution with identical
\code{x} and settings. Use \code{base::options(lss_cache_dir)} to change the
location where cache files are saved.}

\item{simil_method}{specifies method to compute similarity between features.
The value is passed to \code{\link[quanteda.textstats:textstat_simil]{quanteda.textstats::textstat_simil()}}; "cosine" is
used by default.}

\item{engine}{select the engine to factorize \code{x} to generate word vectors. Choose
from \code{\link[RSpectra:svds]{RSpectra::svds()}}, \code{\link[irlba:irlba]{irlba::irlba()}}, \code{\link[rsvd:rsvd]{rsvd::rsvd()}}, and
\code{\link[rsparse:GloVe]{rsparse::GloVe()}}.}

\item{auto_weight}{automatically determine weights to approximate the
polarity of terms to seed words. See details.}

\item{include_data}{if \code{TRUE}, the fitted model includes the dfm supplied as \code{x}.}

\item{verbose}{show messages if \code{TRUE}.}

\item{w}{the size of word vectors. Used only when \code{x} is a \code{fcm}.}

\item{max_count}{passed to \code{x_max} in \code{rsparse::GloVe$new()} where cooccurrence
counts are ceiled to this threshold. It should be changed according to the
size of the corpus. Used only when \code{x} is a \code{fcm}.}
}
\description{
Latent Semantic Scaling (LSS) is a word embedding-based semisupervised algorithm
for document scaling.
}
\details{
Latent Semantic Scaling (LSS) is a semisupervised document scaling
method. \code{textmodel_lss()} constructs word vectors from user-provided
documents (\code{x}) and weights words (\code{terms}) based on their semantic
proximity to seed words (\code{seeds}). Seed words are any known polarity words
(e.g. sentiment words) that users should manually choose. The required
number of seed words is usually 5 to 10 for each end of the scale.

If \code{seeds} is a named numeric vector with positive and negative values, a
bipolar LSS model is constructed; if \code{seeds} is a character vector, a
unipolar LSS model is constructed. Usually bipolar models perform better in document
scaling because both ends of the scale are defined by the user.

A seed word's polarity score computed by \code{textmodel_lss()} tends to diverge
from its original score given by the user because its score is affected
not only by its original score but also by the original scores of all other
seed words. If \code{auto_weight = TRUE}, the original scores are weighted
automatically using \code{\link[stats:optim]{stats::optim()}} to minimize the squared difference
between seed words' computed and original scores. Weighted scores are saved
in \code{seed_weighted} in the object.
}
\examples{
\donttest{
library("quanteda")
library("LSX")

# download corpus; corp is NULL when the download or the read fails
con <- NULL
corp <- tryCatch({
   con <- url("https://bit.ly/2GZwLcN", "rb")
   readRDS(con)
   },
   error = function(e) NULL,
   warning = function(w) NULL,
   # con stays NULL if url() failed, so only close an existing connection
   finally = if (!is.null(con)) close(con)
)

# stop here when the corpus could not be obtained
if (is.null(corp))
   quit("no")

# split into sentences and tokenize without punctuation and stopwords,
# keeping only tokens made of letters (removed tokens leave padding)
toks <- corpus_reshape(corp, "sentences") \%>\%
        tokens(remove_punct = TRUE) \%>\%
        tokens_remove(stopwords("en")) \%>\%
        tokens_select("^[\\\\p{L}]+$", valuetype = "regex", padding = TRUE)
dfmt <- dfm(toks) \%>\%
        dfm_trim(min_termfreq = 10)

# seed words taken from the sentiment dictionary
seed <- as.seedwords(data_dictionary_sentiment)

# SVD
lss_svd <- textmodel_lss(dfmt, seed)
head(coef(lss_svd), 20)
head(predict(lss_svd, newdata = dfmt))
head(predict(lss_svd, newdata = dfmt, min_n = 10)) # more robust

dfmt_grp <- dfm_group(dfmt) # group sentences

# sentiment on economy
eco <- head(textstat_context(toks, 'econom*'), 500)
lss_svd_eco <- textmodel_lss(dfmt, seed, terms = eco)
head(predict(lss_svd_eco, newdata = dfmt_grp))

# sentiment on politics
pol <- head(textstat_context(toks, 'politi*'), 500)
lss_svd_pol <- textmodel_lss(dfmt, seed, terms = pol)
head(predict(lss_svd_pol, newdata = dfmt_grp))

# modify hyper-parameters of existing model
# (here: only the 1st and 8th seed words, and slice = 200)
lss_svd_pol2 <- as.textmodel_lss(lss_svd_pol, seed[c(1, 8)], terms = pol, slice = 200)
head(predict(lss_svd_pol2, newdata = dfmt_grp))

# GloVe
fcmt <- fcm(dfmt, tri = FALSE)
lss_glov <- textmodel_lss(fcmt, seed)
head(predict(lss_glov, newdata = dfmt_grp))
}

}
\references{
Watanabe, Kohei. 2020. "Latent Semantic Scaling: A Semisupervised
Text Analysis Technique for New Domains and Languages", Communication
Methods and Measures. \doi{10.1080/19312458.2020.1832976}.

Watanabe, Kohei. 2017. "Measuring News Bias: Russia's Official News Agency
ITAR-TASS' Coverage of the Ukraine Crisis", European Journal of
Communication. \doi{10.1177/0267323117695735}.
}
