| Title: | Fast Multi-Pattern String Matching with the 'Aho-Corasick' Algorithm |
|---|---|
| Description: | Provide fast multi-pattern string matching for 'R' using the 'Aho-Corasick' algorithm, powered by the 'Rust' 'aho-corasick' crate. It builds reusable automatons for detecting matches, counting matches, locating match offsets, extracting matched text, and replacing matches in character vectors. For more details on the 'Aho-Corasick' algorithm, please see Aho and Corasick (1975) <doi:10.1145/360825.360855>. |
| Authors: | Hao Cheng [aut, cre, cph] |
| Maintainer: | Hao Cheng <[email protected]> |
| License: | MIT + file LICENSE |
| Version: | 0.2.0 |
| Built: | 2026-06-02 18:53:10 UTC |
| Source: | https://github.com/cran/ahocorasick |
ac_build() compiles a character vector of patterns into a reusable
automaton backed by the Rust aho-corasick crate.
```r
ac_build(
  patterns,
  match_kind = c("standard", "leftmost_first", "leftmost_longest"),
  implementation = c("auto", "noncontiguous_nfa", "contiguous_nfa", "dfa"),
  ascii_case_insensitive = FALSE,
  duplicate = c("keep", "error", "deduplicate")
)
```
patterns |
A character vector of non-empty patterns. |
match_kind |
Matching semantics: one of `"standard"`, `"leftmost_first"`, or `"leftmost_longest"`. |
implementation |
Rust automaton implementation: one of `"auto"`, `"noncontiguous_nfa"`, `"contiguous_nfa"`, or `"dfa"`. |
ascii_case_insensitive |
Use ASCII-only case-insensitive matching. Default is `FALSE`. |
duplicate |
How duplicate patterns are handled: one of `"keep"`, `"error"`, or `"deduplicate"`. |
An immutable <ac_automaton> object.
ac_locate(), ac_locate_df(), ac_detect(), ac_count(),
ac_extract(), ac_extract_df(), ac_replace(), ac_patterns().
```r
ac <- ac_build(c("hello", "world"))
length(ac)
ac_info(ac)
```
ac_count() returns the number of pattern matches in each document.
```r
ac_count(ac, doc, overlapping = FALSE, na = c("keep", "zero", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
overlapping |
Default is `FALSE`. |
na |
How to handle `NA` values in `doc`: one of `"keep"`, `"zero"`, or `"error"`. |
An integer vector with the same length as doc.
ac_count_file(), ac_detect(), ac_locate(), ac_extract().
```r
if (requireNamespace("dplyr", quietly = TRUE)) {
  ac <- ac_build(c("hello", "world"))
  docs <- data.frame(doc = c("hello world", "nothing", "world"))
  dplyr::mutate(docs, n_matches = ac_count(ac, doc))
}
```
ac_count_file() returns the number of pattern matches in each file.
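```r
ac_count_file(ac, path, stream = FALSE, overlapping = FALSE)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
path |
A vector of file paths to search. |
stream |
If `TRUE`, search each file as a stream rather than reading it into memory first. Default is `FALSE`. |
overlapping |
Default is `FALSE`. |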
An integer vector with the same length as path.
ac_count(), ac_detect_file(), ac_locate_bytes().
```r
ac <- ac_build(c("hello", "world"))
path <- tempfile()
writeLines("hello hello world", path)
ac_count_file(ac, path)
```
ac_detect() returns whether each document has at least one pattern match.
```r
ac_detect(ac, doc, na = c("keep", "false", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
na |
How to handle `NA` values in `doc`: one of `"keep"`, `"false"`, or `"error"`. |
A logical vector with the same length as doc.
ac_detect_file(), ac_count(), ac_locate(), ac_extract().
```r
if (requireNamespace("dplyr", quietly = TRUE)) {
  ac <- ac_build(c("hello", "world"))
  docs <- data.frame(doc = c("hello world", "nothing", "world"))
  dplyr::mutate(docs, matched = ac_detect(ac, doc))
}
```
ac_detect_file() returns whether each file has at least one pattern match.
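```r
ac_detect_file(ac, path, stream = FALSE)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
path |
A vector of file paths to search. |
stream |
If `TRUE`, search each file as a stream rather than reading it into memory first. Default is `FALSE`. |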
A logical vector with the same length as path.
ac_detect(), ac_count_file(), ac_locate_bytes().
```r
ac <- ac_build(c("hello", "world"))
path <- tempfile()
writeLines("hello world", path)
ac_detect_file(ac, path)
```
ac_extract() returns one list element per document. Each element contains
the matched text and the corresponding pattern values.
```r
ac_extract(ac, doc, overlapping = FALSE, na = c("keep", "empty", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
overlapping |
Default is `FALSE`. |
na |
How to handle `NA` values in `doc`: one of `"keep"`, `"empty"`, or `"error"`. |
A list with the same length as doc. Each element is a data frame
with one row per match and two columns:
matches: Text matched in the document.
patterns: Pattern values corresponding to each match.
ac_extract_df(), ac_locate(), ac_detect(), ac_count().
```r
if (
  requireNamespace("dplyr", quietly = TRUE) &&
    requireNamespace("tibble", quietly = TRUE) &&
    requireNamespace("tidyr", quietly = TRUE)
) {
  ac <- ac_build(c("hello", "world"))
  tibble::tibble(doc = c("hello world", "nothing", "world")) |>
    dplyr::mutate(extracted = ac_extract(ac, doc)) |>
    tidyr::unnest(extracted)
}
```
ac_extract_df() is the data-frame form of ac_extract(). It is useful when
you want one row per match instead of one list element per document.
```r
ac_extract_df(ac, doc, overlapping = FALSE, na = c("omit", "keep", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
overlapping |
Default is `FALSE`. |
na |
How to handle `NA` values in `doc`: one of `"omit"`, `"keep"`, or `"error"`. |
A data frame with one row per match and three columns:
doc_id, matches, and patterns.
```r
ac <- ac_build(c("hello", "world"))
doc <- c("hello world", "nothing", "world hello")
ac_extract_df(ac, doc)
```
ac_extract_file() returns one list element per file. Each element contains
the matched text and the corresponding pattern values.
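```r
ac_extract_file(ac, path, stream = FALSE, overlapping = FALSE)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
path |
A vector of file paths to search. |
stream |
If `TRUE`, search each file as a stream rather than reading it into memory first. Default is `FALSE`. |
overlapping |
Default is `FALSE`. |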
A list with the same length as path. Each element is a data frame
with one row per match and two columns:
matches: Text matched in the file.
patterns: Pattern values corresponding to each match.
ac_extract(), ac_detect_file(), ac_count_file().
```r
ac <- ac_build(c("hello", "world"))
path <- tempfile()
writeLines("hello world", path)
ac_extract_file(ac, path)
```
Return automaton metadata
```r
ac_info(ac)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
A list of automaton metadata.
```r
ac <- ac_build(c("hello", "world"))
ac_info(ac)
```
ac_locate() searches a character vector with a compiled automaton and
returns one list element per document. Character offsets are 1-based and
inclusive, so they can be used directly with substr().
```r
ac_locate(ac, doc, overlapping = FALSE, na = c("keep", "empty", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
overlapping |
Default is `FALSE`. |
na |
How to handle `NA` values in `doc`: one of `"keep"`, `"empty"`, or `"error"`. |
A list with the same length as doc. Each element is a data frame
with one row per match and three columns:
pattern_id: Index of the matched pattern in ac_patterns(ac).
start: 1-based index of the first character in each match.
end: 1-based index of the last character in each match.
ac_locate_df(), ac_locate_bytes(), ac_extract(),
ac_detect(), ac_count().
```r
if (
  requireNamespace("dplyr", quietly = TRUE) &&
    requireNamespace("tibble", quietly = TRUE) &&
    requireNamespace("tidyr", quietly = TRUE)
) {
  ac <- ac_build(c("hello", "world"))
  tibble::tibble(doc = c("hello world", "nothing", "world")) |>
    dplyr::mutate(hits = ac_locate(ac, doc)) |>
    tidyr::unnest(hits)
}
```
ac_locate_bytes() searches a character vector with a compiled automaton
and returns byte offsets from the Rust aho-corasick crate. Byte offsets are
0-based, and byte_end is end-exclusive.
```r
ac_locate_bytes(ac, doc, overlapping = FALSE, na = c("omit", "keep", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
overlapping |
Default is `FALSE`. |
na |
How to handle `NA` values in `doc`: one of `"omit"`, `"keep"`, or `"error"`. |
A data frame with one row per match and four columns:
doc_id, pattern_id, byte_start, and byte_end.
```r
ac <- ac_build(c("hello", "world"))
doc <- c("hello world", "nothing", "world hello")
ac_locate_bytes(ac, doc)
```
ac_locate_df() is the data-frame form of ac_locate(). It is useful when
you want one row per match instead of one list element per document.
```r
ac_locate_df(ac, doc, overlapping = FALSE, na = c("omit", "keep", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search. |
overlapping |
Default is `FALSE`. |
na |
How to handle `NA` values in `doc`: one of `"omit"`, `"keep"`, or `"error"`. |
A data frame with one row per match and four columns:
doc_id, pattern_id, start, and end.
ac_locate(), ac_locate_bytes(), ac_extract_df().
```r
ac <- ac_build(c("hello", "world"))
doc <- c("hello world", "nothing", "world hello")
ac_locate_df(ac, doc)
```
ac_locate_file() searches files with a compiled automaton and returns one
list element per file. Character offsets are 1-based and inclusive, so they
can be used directly with substr().
```r
ac_locate_file(ac, path, overlapping = FALSE)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
path |
A vector of file paths to search. |
overlapping |
Default is `FALSE`. |
File location search is always non-streaming. Converting byte offsets from a
streaming search into R-facing character offsets would require a second pass
over the same file to reconstruct UTF-8 character boundaries. Keeping
ac_locate_file() as a simple in-memory search is the clearest
implementation.
A list with the same length as path. Each element is a data frame
with one row per match and three columns:
pattern_id: Index of the matched pattern in ac_patterns(ac).
start: 1-based index of the first character in each match.
end: 1-based index of the last character in each match.
ac_locate(), ac_detect_file(), ac_count_file(),
ac_extract_file().
```r
ac <- ac_build(c("hello", "world"))
path <- tempfile()
writeLines("hello world", path)
ac_locate_file(ac, path)
```
Return patterns stored in an automaton
```r
ac_patterns(ac)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
A character vector of stored patterns.
```r
ac <- ac_build(c("hello", "world"))
ac_patterns(ac)
```
ac_replace() replaces all non-overlapping matches in each document with
the corresponding replacement string.
```r
ac_replace(ac, doc, replace_with, na = c("keep", "empty", "error"))
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
doc |
A character vector of documents to search and replace. |
replace_with |
A character vector of replacements. If length 1, the
same replacement is used for every pattern. Otherwise, it must have the
same length as `ac_patterns(ac)`. |
na |
How to handle `NA` values in `doc`: one of `"keep"`, `"empty"`, or `"error"`. |
A character vector with the same length and names as doc.
ac_build(), ac_detect(), ac_count(), ac_extract(),
ac_locate().
```r
ac <- ac_build(c("fox", "brown", "quick"))
ac_replace(
  ac,
  "The quick brown fox.",
  c("sloth", "grey", "slow")
)

ac <- ac_build(c("append", "appendage", "app"), match_kind = "leftmost_first")
ac_replace(ac, "append the app to the appendage", c("x", "y", "z"))
```
ac_replace_file() replaces all non-overlapping matches in input files and
writes the result to output files.
```r
ac_replace_file(ac, path, replace_with, output = NULL, stream = FALSE)
```
ac |
An `<ac_automaton>` object, as returned by `ac_build()`. |
path |
A vector of input file paths to search and replace. |
replace_with |
A character vector of replacements. If length 1, the
same replacement is used for every pattern. Otherwise, it must have the
same length as `ac_patterns(ac)`. |
output |
A vector of output file paths. When supplied, it must have the same
length as `path`. Default is `NULL`. |
stream |
If `TRUE`, process each file as a stream rather than reading it into memory first. Default is `FALSE`. |
A character vector of output file paths with the same length as
path.
ac_replace(), ac_detect_file(), ac_count_file().
```r
ac <- ac_build(c("fox", "brown", "quick"))
path <- tempfile(fileext = ".txt")
writeLines("The quick brown fox.", path)
ac_replace_file(path = path, ac = ac, replace_with = c("sloth", "grey", "slow"))
```