textmodel Performance Comparisons

library("quanteda")
## Package version: 4.1.0
## Unicode version: 15.1
## ICU version: 74.2
## Parallel computing: disabled
## See https://quanteda.io for tutorials and examples.
library("quanteda.textmodels")

Naive Bayes

quanteda.textmodels implements fast methods for fitting and predicting from Naive Bayes textmodels, built especially for the sparse document-feature matrices created from textual data. It implements two models: multinomial and Bernoulli (see Manning, Raghavan, and Schütze 2008, Chapter 13).
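
For orientation, here is a minimal toy sketch (an illustration added here, not taken from the benchmarks below) of fitting both distributions with textmodel_nb() on a small two-class dfm:

txt <- c(d1 = "good fun good acting", d2 = "great plot great cast",
         d3 = "liked it a lot", d4 = "terrible acting",
         d5 = "bad bad plot", d6 = "awful waste of time")
y <- factor(c("pos", "pos", "pos", "neg", "neg", "neg"))
dfmat_toy <- dfm(tokens(txt))
# same interface, different event model
textmodel_nb(dfmat_toy, y, distribution = "multinomial")
textmodel_nb(dfmat_toy, y, distribution = "Bernoulli")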

Here, we compare the performance of the two models, and then compare both to equivalent implementations from two other packages.

For these tests, we use the dataset of 50,000 movie reviews from Maas et al. (2011), along with their partition into training and test sets for fitting and evaluating the models.

# large movie review database of 50,000 movie reviews
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

dfmat <- tokens(data_corpus_LMRD) %>%
  dfm()
dfmat_train <- dfm_subset(dfmat, set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
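
A quick look at the partition (not part of the timings) is worthwhile before benchmarking; the Maas et al. split should divide the corpus evenly into 25,000 training and 25,000 test reviews, with balanced polarity labels:

# document counts per set and the class balance of the training labels
table(dfmat$set)
table(dfmat_train$polarity)
ndoc(dfmat_test)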

Comparing the performance of fitting the two models:

library("microbenchmark")
microbenchmark(
    multi = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
    bern = textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
    times = 20
)
## Unit: milliseconds
##   expr      min       lq      mean    median       uq      max neval
##  multi 80.07897 80.42267  95.37965  87.12521  95.8212 245.2989    20
##   bern 87.36437 87.90223 105.51042 101.68813 102.6912 251.2851    20

And for prediction:

microbenchmark(
    multi = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "multinomial"),
                    newdata = dfmat_test),
    bern = predict(textmodel_nb(dfmat_train, dfmat_train$polarity, distribution = "Bernoulli"),
                   newdata = dfmat_test),
    times = 20
)
## Unit: milliseconds
##   expr       min        lq     mean    median       uq      max neval
##  multi  90.14064  90.22442 103.5717  91.54499 103.7648 251.3834    20
##   bern 118.04994 131.38469 133.3892 132.47871 138.0929 146.4360    20
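
Speed is only half the story; as an untimed sanity check (added here, not part of the original benchmark), the predicted classes can be compared against the held-out polarity labels:

# untimed check: agreement of predictions with the held-out labels
pred_multi <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                   distribution = "multinomial"),
                      newdata = dfmat_test)
pred_bern <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                  distribution = "Bernoulli"),
                     newdata = dfmat_test)
mean(pred_multi == dfmat_test$polarity)
mean(pred_bern == dfmat_test$polarity)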

Now let’s see how textmodel_nb() compares to equivalent functions from other packages. Multinomial:

library("fastNaiveBayes")
library("naivebayes")
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/

microbenchmark(
    textmodels = {
      tmod <-  textmodel_nb(dfmat_train, dfmat_train$polarity, smooth = 1, distribution = "multinomial")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    naivebayes = {
      tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
    },
    times = 20
)
## Unit: milliseconds
##            expr       min        lq      mean   median       uq      max neval
##      textmodels  90.24134  91.12863  99.68362 104.4445 105.2673 107.5228    20
##  fastNaiveBayes 129.69477 136.50142 143.39691 144.1438 150.5497 158.8182    20
##      naivebayes 103.55443 106.16934 121.51672 114.7251 120.8235 266.5060    20
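
The timings are only meaningful if the implementations agree; a rough, untimed cross-check (added here for illustration, assuming the default type = "class" for both predict() methods) tabulates the quanteda.textmodels predictions against those from naivebayes:

# untimed agreement check between two of the multinomial implementations
pred_qtm <- predict(textmodel_nb(dfmat_train, dfmat_train$polarity,
                                 smooth = 1, distribution = "multinomial"),
                    newdata = dfmat_test)
pred_nvb <- predict(multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"),
                                            dfmat_train$polarity, laplace = 1),
                    newdata = as(dfmat_test, "dgCMatrix"))
table(textmodels = pred_qtm, naivebayes = pred_nvb)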

And Bernoulli. Note that although we supply the Boolean-weighted matrix to textmodel_nb() here, this re-weighting from the count matrix would have been performed automatically within the function had we not done it in advance; it is done in advance here only for the purposes of the comparison, as the quick check below illustrates.

dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")
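
As a quick, untimed check of that claim, fitting textmodel_nb() with distribution = "Bernoulli" on the raw count matrix and on the pre-weighted Boolean matrix should yield identical predictions:

# the Boolean re-weighting inside textmodel_nb() should match doing it manually
nb_counts <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                          smooth = 1, distribution = "Bernoulli")
nb_bool <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity,
                        smooth = 1, distribution = "Bernoulli")
identical(predict(nb_counts, newdata = dfmat_test),
          predict(nb_bool, newdata = dfmat_test_bern))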

microbenchmark(
    textmodel_nb = {
      tmod <-  textmodel_nb(dfmat_train_bern, dfmat_train$polarity, smooth = 1, distribution = "Bernoulli")
      pred <- predict(tmod, newdata = dfmat_test)
    },
    fastNaiveBayes = { 
      tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"), y = dfmat_train$polarity, laplace = 1, sparse = TRUE)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    naivebayes = {
      tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"), dfmat_train$polarity, laplace = 1)
      pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
    },
    times = 20
)
## Unit: milliseconds
##            expr      min       lq     mean   median       uq      max neval
##    textmodel_nb 118.9946 133.3705 135.2605 134.4165 138.9679 148.2901    20
##  fastNaiveBayes 142.1740 154.0823 171.3386 156.5314 158.1285 332.5916    20
##      naivebayes 113.2747 113.7114 121.3939 120.6428 128.8479 131.3425    20

References

Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).

Majka, Michal (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7. https://CRAN.R-project.org/package=naivebayes.

Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.

Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes.