Examples for 'NLP::annotate'


Annotate text strings

Aliases: annotate

Keywords:

### ** Examples

## A simple text (with leading, separating and trailing whitespace).
s <- String("  First sentence.  Second sentence.  ")
##           ****5****0****5****0****5****0****5**
## (The ruler above marks every fifth character position of the text,
## which helps reading off the start/end values printed further below.)

## A very trivial sentence tokenizer.
## Takes a text s (coerced via as.String()) and returns a Span holding
## the start and end character positions of every piece of s that
## begins at a non-whitespace character and runs through the next
## period.  Returns an empty Span when s contains no such piece.
sent_tokenizer <-
function(s) {
    s <- as.String(s)
    m <- gregexpr("[^[:space:]][^.]*\\.", s)[[1L]]
    ## gregexpr() signals "no match" with a single -1 (and a
    ## match.length of -1); turning that into a span would yield the
    ## bogus Span(-1, -3), so return an empty Span instead.
    if (isTRUE(m[1L] == -1L)) {
        return(Span(integer(), integer()))
    }
    Span(m, m + attr(m, "match.length") - 1L)
}
## (Could also use Regexp_Tokenizer() with the above regexp pattern.)
## A simple sentence token annotator based on the sentence tokenizer:
## wraps sent_tokenizer so that it can be handed to annotate() below.
sent_token_annotator <- Simple_Sent_Token_Annotator(sent_tokenizer)

## Annotate sentence tokens and print the resulting annotation (one
## "sentence" row per sentence, with its start and end positions in s).
a1 <- annotate(s, sent_token_annotator)
a1
 id type     start end features
  1 sentence     3  17 
  2 sentence    20  35 
## A very trivial word tokenizer.
## Takes a sentence s (coerced via as.String()), drops its final
## character, and returns a Span holding the start and end character
## positions (relative to s) of every whitespace-delimited word in
## what remains.  Returns an empty Span when no word is left.
word_tokenizer <-
function(s) {
    s <- as.String(s)
    ## Remove the last character (should be a period when using
    ## sentences determined with the trivial sentence tokenizer).
    s <- substring(s, 1L, nchar(s) - 1L)
    ## Split on whitespace separators.
    m <- gregexpr("[^[:space:]]+", s)[[1L]]
    ## gregexpr() signals "no match" with a single -1 (and a
    ## match.length of -1); turning that into a span would yield the
    ## bogus Span(-1, -3), so return an empty Span instead.
    if (isTRUE(m[1L] == -1L)) {
        return(Span(integer(), integer()))
    }
    Span(m, m + attr(m, "match.length") - 1L)
}
## A simple word token annotator based on the word tokenizer:
## wraps word_tokenizer so that it can be handed to annotate() below.
word_token_annotator <- Simple_Word_Token_Annotator(word_tokenizer)

## Annotate word tokens using the already available sentence token
## annotations (passed as the third argument), then print the result:
## the sentence rows now carry a "constituents" feature and "word" rows
## are listed after them.
a2 <- annotate(s, word_token_annotator, a1)
a2
 id type     start end features
  1 sentence     3  17 constituents=<<integer,2>>
  2 sentence    20  35 constituents=<<integer,2>>
  3 word         3   7 
  4 word         9  16 
  5 word        20  25 
  6 word        27  34 
## Can also perform sentence and word token annotations in a pipeline:
## the result is not assigned, so it is printed directly, and it shows
## the same rows as a2 above.
p <- Annotator_Pipeline(sent_token_annotator, word_token_annotator)
annotate(s, p)
 id type     start end features
  1 sentence     3  17 constituents=<<integer,2>>
  2 sentence    20  35 constituents=<<integer,2>>
  3 word         3   7 
  4 word         9  16 
  5 word        20  25 
  6 word        27  34 

[Package NLP version 0.3-2 Index]