library("quanteda")
## Package version: 4.4
## Unicode version: 15.1
## ICU version: 74.2
## Parallel computing: 4 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
library("quanteda.textmodels")
quanteda.textmodels implements fast methods for fitting and predicting Naive Bayes textmodels built especially for sparse document-feature matrices from textual data. It implements two models: multinomial and Bernoulli. (See Manning, Raghavan, and Schütze 2008, Chapter 13.)
Here, we compare performance for the two models, and then to the performance from two other packages for fitting these models.
For these tests, we will choose the dataset of 50,000 movie reviews from Maas et al. (2011). We will use their partition into test and training sets for fitting and testing our models.
## Error in load(url("https://quanteda.org/data/data_corpus_LMRD.rda")) :
## cannot open the connection to 'https://quanteda.org/data/data_corpus_LMRD.rda'
# Fetch the Large Movie Review Dataset corpus (50,000 reviews); the .rda file
# is expected to restore an object named `data_corpus_LMRD` into the workspace.
load(url("https://quanteda.org/data/data_corpus_LMRD.rda"))

# Tokenise the corpus and turn the tokens into a document-feature matrix,
# written as a nested call instead of a pipe.
dfmat <- dfm(tokens(data_corpus_LMRD))

# Keep the documents for which the `set` condition evaluates to "train".
dfmat_train <- dfm_subset(dfmat, subset = set == "train")
dfmat_test <- dfm_subset(dfmat, set == "test")
Comparing the performance of fitting the model:
library("microbenchmark")
# Time model fitting for both distributions. Each expression is evaluated
# 20 times on the training matrix, with its `polarity` variable as the label.
microbenchmark(
  multi = textmodel_nb(
    x = dfmat_train,
    y = dfmat_train$polarity,
    distribution = "multinomial"
  ),
  bern = textmodel_nb(
    x = dfmat_train,
    y = dfmat_train$polarity,
    distribution = "Bernoulli"
  ),
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 17.10912 17.49196 27.79181 18.08311 22.91028 159.9041 20
## bern 18.74859 19.14490 21.80070 19.56300 23.88914 32.7407 20
And for prediction:
# Fit each model once up front so that the benchmark below times the
# prediction step alone, rather than re-fitting the model on every evaluation.
tmod_multi <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                           distribution = "multinomial")
tmod_bern <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                          distribution = "Bernoulli")

# Time prediction on the test matrix, 20 evaluations per model.
microbenchmark(
  multi = predict(tmod_multi, newdata = dfmat_test),
  bern = predict(tmod_bern, newdata = dfmat_test),
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## multi 18.62859 18.80847 20.39304 19.03365 23.01303 24.52336 20
## bern 23.97790 24.35765 27.10122 28.22408 28.96660 33.08180 20
Now let’s see how textmodel_nb() compares to equivalent
functions from other packages. Multinomial:
library("fastNaiveBayes")
library("naivebayes")
## naivebayes 1.0.0 loaded
## For more information please visit:
## https://majkamichal.github.io/naivebayes/
# Benchmark the full fit-then-predict cycle for the multinomial model across
# three implementations, each with a smoothing value of 1. The dgCMatrix
# coercions sit inside the timed expressions of the two packages that are
# handed that format, so their cost is part of those timings.
microbenchmark(
  textmodels = {
    tmod <- textmodel_nb(dfmat_train, dfmat_train$polarity,
                         smooth = 1, distribution = "multinomial")
    pred <- predict(tmod, newdata = dfmat_test)
  },
  fastNaiveBayes = {
    tmod <- fnb.multinomial(as(dfmat_train, "dgCMatrix"),
                            y = dfmat_train$polarity,
                            laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  naivebayes = {
    # `<-` rather than `=`, matching the assignments in the other two arms.
    tmod <- multinomial_naive_bayes(as(dfmat_train, "dgCMatrix"),
                                    dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodels 18.74138 19.00528 20.89362 19.79043 23.21607 24.17217 20
## fastNaiveBayes 20.83614 20.97519 29.15819 22.93798 24.76631 145.46977 20
## naivebayes 19.91762 20.21023 23.69874 22.38225 24.23613 52.12602 20
And Bernoulli. Note here that while we are supplying the Boolean
matrix to textmodel_nb(), this re-weighting from the count
matrix would have been performed automatically within the function had
we not done so in advance - it’s done here just for comparison.
# Pre-compute Boolean (presence/absence) versions of the training and test
# matrices so that every implementation below receives the same inputs.
dfmat_train_bern <- dfm_weight(dfmat_train, scheme = "boolean")
dfmat_test_bern <- dfm_weight(dfmat_test, scheme = "boolean")

# Benchmark the full fit-then-predict cycle for the Bernoulli model across
# three implementations, each with a smoothing value of 1.
microbenchmark(
  textmodel_nb = {
    tmod <- textmodel_nb(dfmat_train_bern, dfmat_train$polarity,
                         smooth = 1, distribution = "Bernoulli")
    # Predict on the Boolean test matrix, like the other two arms, so that all
    # three implementations are timed on identical inputs.
    pred <- predict(tmod, newdata = dfmat_test_bern)
  },
  fastNaiveBayes = {
    tmod <- fnb.bernoulli(as(dfmat_train_bern, "dgCMatrix"),
                          y = dfmat_train$polarity,
                          laplace = 1, sparse = TRUE)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  naivebayes = {
    # `<-` rather than `=`, matching the assignments in the other two arms.
    tmod <- bernoulli_naive_bayes(as(dfmat_train_bern, "dgCMatrix"),
                                  dfmat_train$polarity, laplace = 1)
    pred <- predict(tmod, newdata = as(dfmat_test_bern, "dgCMatrix"))
  },
  times = 20
)
## Unit: milliseconds
## expr min lq mean median uq max neval
## textmodel_nb 23.94434 24.19664 26.30612 24.91017 28.26231 31.93616 20
## fastNaiveBayes 23.27066 26.91173 26.85791 27.16503 27.45729 27.94757 20
## naivebayes 21.81746 21.94335 30.51446 25.57582 25.92005 151.77917 20
Maas, Andrew L., Raymond E. Daly, Peter T. Pham, Dan Huang, Andrew Y. Ng, and Christopher Potts (2011). “Learning Word Vectors for Sentiment Analysis”. The 49th Annual Meeting of the Association for Computational Linguistics (ACL 2011).
Majka M (2020). naivebayes: High Performance Implementation of the Naive Bayes Algorithm in R. R package version 0.9.7, <URL: https://CRAN.R-project.org/package=naivebayes>. Date: 2020-03-08.
Manning, Christopher D., Prabhakar Raghavan, and Hinrich Schütze (2008). Introduction to Information Retrieval. Cambridge University Press.
Skogholt, Martin (2020). fastNaiveBayes: Extremely Fast Implementation of a Naive Bayes Classifier. R package version 2.2.1. https://github.com/mskogholt/fastNaiveBayes. Date: 2020-05-04.