library("quanteda")
## Package version: 4.1.0
## Unicode version: 15.1
## ICU version: 74.2
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library("quanteda.textmodels")
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare the performance of the two models with each other, and then with the performance of two other packages that fit the same models.
For these tests, we will use the dataset of 50,000 movie reviews from Maas et al. (2011). We will use their partition into training and test sets for fitting our models and for prediction.
# Large movie review database of 50,000 movie reviews. The code below expects
# the loaded .rda file to provide `data_corpus_LMRD`.
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

# Tokenise the corpus and build the document-feature matrix. The native pipe
# is used so this step does not depend on `%>%` being re-exported by one of
# the attached packages (nothing in this script attaches magrittr itself).
dfmat <- data_corpus_LMRD |>
  tokens() |>
  dfm()

# Split on the `set` document variable into training and test partitions.
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")
# Time model fitting only: each expression trains a Naive Bayes classifier on
# the training dfm, using its `polarity` document variable as the class label.
microbenchmark(
  multi = textmodel_nb(
    x = dfmat_train,
    y = dfmat_train$polarity,
    distribution = "multinomial"
  ),
  bern = textmodel_nb(
    x = dfmat_train,
    y = dfmat_train$polarity,
    distribution = "Bernoulli"
  ),
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 80.07897 80.42267 95.37965 87.12521 95.8212 245.2989 20
## bern 87.36437 87.90223 105.51042 101.68813 102.6912 251.2851 20
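And for prediction (note that each timed expression refits the model before predicting, so these timings include fitting as well as prediction):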
# Time fitting followed by prediction on the test dfm. The model is refit
# inside each timed expression, so the timings cover both steps.
microbenchmark(
  multi = predict(
    textmodel_nb(
      x = dfmat_train,
      y = dfmat_train$polarity,
      distribution = "multinomial"
    ),
    newdata = dfmat_test
  ),
  bern = predict(
    textmodel_nb(
      x = dfmat_train,
      y = dfmat_train$polarity,
      distribution = "Bernoulli"
    ),
    newdata = dfmat_test
  ),
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 90.14064 90.22442 103.5717 91.54499 103.7648 251.3834 20
## bern 118.04994 131.38469 133.3892 132.47871 138.0929 146.4360 20
Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
# Multinomial Naive Bayes, fit + predict, across three packages. Every
# expression trains on the training dfm with Laplace smoothing of 1 and then
# predicts on the test dfm; the two other packages are handed the dfm coerced
# to a "dgCMatrix".
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(
      dfmat_train,
      dfmat_train$polarity,
      smooth = 1,
      distribution = "multinomial"
    )
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.multinomial(
      as(dfmat_train, "dgCMatrix"),
      y = dfmat_train$polarity,
      laplace = 1,
      sparse = TRUE
    )
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    # `<-` rather than `=`, for consistency with the other expressions
    tmod <- multinomial_naive_bayes(
      as(dfmat_train, "dgCMatrix"),
      dfmat_train$polarity,
      laplace = 1
    )
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 90.24134 91.12863 99.68362 104.4445 105.2673 107.5228 20
## fastNaiveBayes 129.69477 136.50142 143.39691 144.1438 150.5497 158.8182 20
## naivebayes 103.55443 106.16934 121.51672 114.7251 120.8235 266.5060 20
And Bernoulli. Note here that while we are supplying the Boolean
matrix to textmodel_nb(), this re-weighting from the count
matrix would have been performed automatically within the function had
we not done so in advance - it’s done here just for comparison.
# Pre-compute Boolean (presence/absence) versions of both partitions outside
# the timed code so that no package is charged for the re-weighting.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

# Bernoulli Naive Bayes, fit + predict, across three packages. Every
# expression trains on the Boolean training dfm with Laplace smoothing of 1
# and predicts on the Boolean test dfm.
microbenchmark(
  textmodel_nb = {
    tmod <- textmodel_nb(
      dfmat_train_bern,
      dfmat_train$polarity,
      smooth = 1,
      distribution = "Bernoulli"
    )
    # Predict on the Boolean test matrix, like the other two packages do, so
    # that all three expressions receive equivalent pre-weighted input.
    pred <- predict(tmod, newdata = dfmat_test_bern)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(
      as(dfmat_train_bern, "dgCMatrix"),
      y = dfmat_train$polarity,
      laplace = 1,
      sparse = TRUE
    )
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    # `<-` rather than `=`, for consistency with the other expressions
    tmod <- bernoulli_naive_bayes(
      as(dfmat_train_bern, "dgCMatrix"),
      dfmat_train$polarity,
      laplace = 1
    )
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodel_nb 118.9946 133.3705 135.2605 134.4165 138.9679 148.2901 20
## fastNaiveBayes 142.1740 154.0823 171.3386 156.5314 158.1285 332.5916 20
## naivebayes 113.2747 113.7114 121.3939 120.6428 128.8479 131.3425 20
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-05-04.