Aliases: annotate
Keywords:
### ** Examples

## A simple text.
s <- String("  First sentence.  Second sentence.  ")
##           ****5****0****5****0****5****0****5**

## A very trivial sentence tokenizer.
sent_tokenizer <- function(s) {
    s <- as.String(s)
    m <- gregexpr("[^[:space:]][^.]*\\.", s)[[1L]]
    Span(m, m + attr(m, "match.length") - 1L)
}
## (Could also use Regexp_Tokenizer() with the above regexp pattern.)

## A simple sentence token annotator based on the sentence tokenizer.
sent_token_annotator <- Simple_Sent_Token_Annotator(sent_tokenizer)

## Annotate sentence tokens.
a1 <- annotate(s, sent_token_annotator)
a1
 id type     start end features
  1 sentence     3  17
  2 sentence    20  35
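## Not part of the original example: a sketch of the Regexp_Tokenizer()
## alternative mentioned above, assuming the tokenizer it returns can be
## called directly on the string to yield the same sentence spans, and
## assuming NLP's String subscripting by span/annotation objects.
regexp_sent_tokenizer <- Regexp_Tokenizer("[^[:space:]][^.]*\\.")
regexp_sent_tokenizer(s)
## Extract the annotated sentence substrings themselves.
s[a1]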
## A very trivial word tokenizer.
word_tokenizer <- function(s) {
    s <- as.String(s)
    ## Remove the last character (should be a period when using
    ## sentences determined with the trivial sentence tokenizer).
    s <- substring(s, 1L, nchar(s) - 1L)
    ## Split on whitespace separators.
    m <- gregexpr("[^[:space:]]+", s)[[1L]]
    Span(m, m + attr(m, "match.length") - 1L)
}

## A simple word token annotator based on the word tokenizer.
word_token_annotator <- Simple_Word_Token_Annotator(word_tokenizer)

## Annotate word tokens using the already available sentence token
## annotations.
a2 <- annotate(s, word_token_annotator, a1)
a2
 id type     start end features
  1 sentence     3  17 constituents=<<integer,2>>
  2 sentence    20  35 constituents=<<integer,2>>
  3 word         3   7
  4 word         9  16
  5 word        20  25
  6 word        27  34
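## Not part of the original example: assuming Annotation objects support
## subset() and Strings can be subscripted by annotations, the word tokens
## can be pulled out of the combined annotation directly.
a2w <- subset(a2, type == "word")
s[a2w]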
## Can also perform sentence and word token annotations in a pipeline:
p <- Annotator_Pipeline(sent_token_annotator, word_token_annotator)
annotate(s, p)
 id type     start end features
  1 sentence     3  17 constituents=<<integer,2>>
  2 sentence    20  35 constituents=<<integer,2>>
  3 word         3   7
  4 word         9  16
  5 word        20  25
  6 word        27  34
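## Not part of the original example: a sketch, assuming NLP's
## AnnotatedPlainTextDocument() constructor and its words()/sents()
## accessors, of bundling the text and the pipeline annotations into a
## single document object for further processing.
d <- AnnotatedPlainTextDocument(s, annotate(s, p))
words(d)
sents(d)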