class Ferret::Analysis::TokenStream

Summary

A TokenStream enumerates the sequence of tokens, either from fields of a document or from query text.

This is an abstract class. Concrete subclasses are:

Tokenizer

a TokenStream whose input is a string

TokenFilter

a TokenStream whose input is another TokenStream

Public Instance Methods

next → token click to toggle source

Return the next token from the TokenStream or nil if there are no more tokens.

/*
 * Ruby method TokenStream#next.
 *
 * Fetches the TokenStream struct wrapped by +self+, invokes its +next+
 * callback once, and returns Qnil when that callback yields NULL;
 * otherwise returns whatever get_token() produces for the token.
 */
static VALUE
frb_ts_next(VALUE self)
{
    TokenStream *ts;
    Token *tk;

    GET_TS(ts, self);
    tk = ts->next(ts);

    /* A NULL token is reported to Ruby as nil. */
    return (tk != NULL) ? get_token(tk) : Qnil;
}
text → text click to toggle source

Return the text that the TokenStream is tokenizing.

/*
 * Ruby method TokenStream#text (reader).
 *
 * Returns the Ruby object that object_get() yields for the address of the
 * stream's text pointer. When that lookup gives Qnil and the stream has a
 * non-NULL text pointer, a new Ruby string is built from it, registered
 * with object_set() under the same address, and returned. When the text
 * pointer is NULL the result is Qnil.
 */
static VALUE
frb_ts_get_text(VALUE self)
{
    TokenStream *ts;
    VALUE found;

    Data_Get_Struct(self, TokenStream, ts);

    /* Prefer an object already registered for this stream's text slot. */
    found = object_get(&ts->text);
    if (found != Qnil) {
        return found;
    }

    /* Nothing registered and no C string to wrap: report nil. */
    if (!ts->text) {
        return Qnil;
    }

    /* Wrap the C string and register it so later calls find it. */
    found = rb_str_new2(ts->text);
    object_set(&ts->text, found);
    return found;
}
text = text → text click to toggle source

Set the text attribute of the TokenStream to the text you wish to be tokenized. For example, you may do this:

token_stream.text = File.read(file_name)
/*
 * Ruby method TokenStream#text= (writer).
 *
 * Coerces +rtext+ with StringValue(), passes the result of rs2s(rtext) to
 * the stream's +reset+ callback, stores +rtext+ in the id_text instance
 * variable of +self+, and returns +rtext+.
 */
static VALUE
frb_ts_set_text(VALUE self, VALUE rtext)
{
    TokenStream *stream;

    Data_Get_Struct(self, TokenStream, stream);

    /* Coerce first: StringValue may replace rtext with a converted String. */
    StringValue(rtext);
    stream->reset(stream, rs2s(rtext));

    /*
     * Keep a reference on the Ruby side so the string handed to reset()
     * is not garbage collected.
     *
     * NOTE(review): frb_ts_get_text looks up an object keyed on
     * &ts->text; nothing here updates that entry, so a previously
     * returned string may still be served after this call -- confirm
     * against object_get/object_set and the reset implementations.
     */
    rb_ivar_set(self, id_text, rtext);

    return rtext;
}