Generating Small, Medium, and Large Datasets

Overview

This vignette demonstrates how to use the {samplezoo} package to generate datasets of varying sizes (small, medium, and large) with variables from multiple probability distributions.

Each dataset contains:

  • Variables/columns from common distributions such as Normal, Binomial, Poisson, and others.

  • Adjustable sample sizes to suit your needs.

library(samplezoo)

Generate a small dataset (i.e., 100 rows)

data_small <- samplezoo("small")
head(data_small)
#>       norm    norm2     norm3 binom neg pois       exp      unif      beta
#> 1 25.15694 65.80074 42.402275     1   0    2  1.828935 0.4606579 0.2356178
#> 2 25.73720 45.59507  5.305106     0   0    5 29.517268 0.1910983 0.3974960
#> 3 52.14277 63.15152 23.951638     1   2    3 12.723542 0.5519936 0.4568973
#> 4 59.59455 62.11344 61.097426     0   0    0 18.018434 0.3936939 0.1238944
#> 5 56.22166 50.14880 17.276617     0   0    4  4.299494 0.3424037 0.5205620
#> 6 64.40413 69.64359 39.579413     0   5    6  2.044494 0.3128539 0.0907647
#>      gamma     chisq     t_dist
#> 1 2.993321 0.5156396 -1.8582884
#> 2 2.644299 0.2756526  0.5036235
#> 3 1.969294 5.8376690 -1.9433715
#> 4 7.115710 3.7607821  0.7088352
#> 5 8.076411 2.6783601  1.6695084
#> 6 6.180865 0.1764780  0.3286485

Generate a medium-sized dataset (i.e., 1,000 rows)

data_medium <- samplezoo("medium")
head(data_medium)
#>       norm    norm2    norm3 binom neg pois        exp       unif       beta
#> 1 66.32780 65.03845 40.73076     0   4    1  0.8459338 0.47356267 0.05140996
#> 2 15.60933 62.24251 35.66566     0   0    3  4.4488537 0.97937912 0.41373055
#> 3 67.99604 65.68543 69.63832     0   4    4 13.5834844 0.07347576 0.49006833
#> 4 77.54035 46.96659 47.95241     1   0    3  6.0745628 0.66047941 0.35677391
#> 5 39.07681 47.36226 49.83504     0   1    4 15.2113350 0.86218629 0.35538683
#> 6 63.32963 56.16136 76.07890     1   1    5  0.9421500 0.79081379 0.22499517
#>      gamma     chisq     t_dist
#> 1 2.096289  3.814471  0.9809230
#> 2 3.537228  3.762445  0.4532561
#> 3 2.665679  9.730170 -1.9000670
#> 4 3.794769  2.938662  0.4266126
#> 5 7.348275  3.732159 -0.3562102
#> 6 6.715543 11.616382  0.8545484

Generate a large dataset (i.e., 10,000 rows)

data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2     norm3 binom neg pois       exp      unif      beta
#> 1 67.38657 70.42156 -2.879439     0   2    2 14.114451 0.5490409 0.3784222
#> 2 63.55910 66.85273 40.235570     1   2    3 20.045941 0.7833406 0.4055449
#> 3 76.08150 53.92179 37.981351     0   1    3  4.900619 0.2500442 0.3095650
#> 4 31.57206 62.14065 27.116840     0   1    6  8.828513 0.6659334 0.1559601
#> 5 63.39814 50.32682 58.956412     0   0    4  2.681587 0.9681289 0.2035393
#> 6 25.19843 56.45019 69.429303     0   0    3  1.357291 0.8655445 0.4260651
#>       gamma    chisq     t_dist
#> 1  3.494189 7.849484  3.3730323
#> 2  2.403993 9.112495  0.4460709
#> 3  4.533870 9.779585 -0.8845991
#> 4  1.228562 9.786626  0.3415335
#> 5 10.894135 8.356028 -1.8756460
#> 6  2.971456 4.546987 -1.3489405

Adding Variation or Ensuring Reproducibility with set.seed()

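To make your results reproducible, call set.seed() before generating random data: the same seed always produces the same dataset, while a different seed (e.g., 123 versus 456 below) produces a different one, giving you controlled variation.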

set.seed(123)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2     norm3 binom neg pois       exp      unif       beta
#> 1 41.59287 83.70725 23.274065     0   1    6  6.628373 0.5468223 0.08294255
#> 2 46.54734 58.33188 35.588540     0   0    5 21.305366 0.3900809 0.63544684
#> 3 73.38062 69.26961 -2.070295     0   2    4  0.189645 0.7262119 0.11520674
#> 4 51.05763 54.31848  6.643849     0   2    2  8.479098 0.5101462 0.38184206
#> 5 51.93932 62.25090 18.040743     0   0    2 11.885521 0.2964126 0.17196046
#> 6 75.72597 71.31986  6.687576     0   1    4  6.363993 0.1442317 0.35908460
#>       gamma     chisq     t_dist
#> 1 6.9893762 10.286282 -0.3814568
#> 2 5.4087626  6.519658 -2.3409216
#> 3 1.2587867  8.011417 -0.4744159
#> 4 0.9871787 14.780626  0.4292511
#> 5 2.4021943  6.799788 -0.6692669
#> 6 4.2109032 17.858701 -0.3370763
set.seed(456)
data_large <- samplezoo("large")
head(data_large)
#>       norm    norm2      norm3 binom neg pois        exp      unif       beta
#> 1 29.84718 68.13494  7.9885694     0   0    5  3.4417303 0.8866347 0.05413307
#> 2 59.32663 52.32066 21.2526086     0   3    3  0.8114356 0.7976466 0.07195440
#> 3 62.01312 62.47569 38.4789563     0   2    6 46.8038907 0.6469920 0.22555129
#> 4 29.16661 53.51086 -0.8656269     0   1    5 11.6955326 0.2036753 0.71455809
#> 5 39.28465 47.19406 47.7819258     1   1    1  0.3535625 0.3653401 0.34619912
#> 6 45.13908 63.33566 53.3620528     1   1    2  4.5592136 0.7628573 0.25880522
#>       gamma     chisq     t_dist
#> 1 6.7914120  4.464348 -1.0150596
#> 2 3.0132520  8.062120  0.3262369
#> 3 4.7360954 10.969593  1.5141157
#> 4 5.1235878  6.249247  0.6432708
#> 5 6.6851637  4.358815  0.2025742
#> 6 0.3903841 20.019575  1.6257109