library("quanteda")
## Package version: 4.2.0
## Unicode version: 15.1
## ICU version: 74.2
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library("quanteda.textmodels")
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.
For these tests, we will choose the dataset of 50,000 movie reviews from Maas et. al. (2011). We will use their partition into test and training sets for training and fitting our models.
# large movie review database of 50,000 movie reviews
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))
dfmat <- tokens(data_corpus_LMRD) %>%
dfm()
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")
microbenchmark(
multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 80.34582 80.49499 87.56626 81.17842 95.06311 103.8359 20
## bern 88.23146 101.98395 108.42785 102.36016 103.33094 263.6680 20
And for prediction:
microbenchmark(
multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
newdata = dfmat_test),
bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
newdata = dfmat_test),
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 90.74203 91.47486 104.6643 92.3188 105.2958 250.3268 20
## bern 120.56544 134.02825 135.7083 134.3810 136.0636 148.6927 20
Now let’s see how textmodel_nb()
compares to equivalent
functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
microbenchmark(
textmodels = {
tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
pred <- predict(tmod, newdata = dfmat_test)
},
fastNaiveBayes = {
tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
},
naivebayes = {
tmod = multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
},
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 90.88102 91.27254 104.8551 91.82568 105.7646 264.2343 20
## fastNaiveBayes 147.78668 153.25063 165.6141 156.69513 158.8421 320.9213 20
## naivebayes 108.30317 112.03012 122.8543 123.64967 128.2120 160.8544 20
And Bernoulli. Note here that while we are supplying the Boolean
matrix to textmodel_nb()
, this re-weighting from the count
matrix would have been performed automatically within the function had
we not done so in advance - it’s done here just for comparison.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
microbenchmark(
textmodel_nb = {
tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
pred <- predict(tmod, newdata = dfmat_test)
},
fastNaiveBayes = {
tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
},
naivebayes = {
tmod = bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
},
times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodel_nb 120.3401 134.2671 146.1340 135.4212 149.1032 294.4838 20
## fastNaiveBayes 155.5763 162.0864 169.8086 164.7229 173.5846 206.2223 20
## naivebayes 120.3778 123.3058 138.6056 133.1250 139.1484 284.8761 20
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-05-04.