tokenizer.h File Reference
#include "allinc.h"
#include "base/list.h"
#include "base/map.h"
#include "base/string.h"
#include "pcre/pcre.h"
Go to the source code of this file.
|
Classes |
| class | token_info |
| | token information More...
|
| class | pcre_key |
| | pcre key More...
|
| class | pcre_entry |
| | pcre entry More...
|
| class | tokenizer |
| | Tokenizes an input string into atomic tokens. More...
|
Typedefs |
| typedef _list< token_info > | tokenizer_output_sequence_t |
| | list of detected tokens
|
| typedef map< size_t, string_t, less< size_t >, true > | abbreviation_map_t |
| | abbreviation map
|
| typedef map< pcre_key, pcre_entry, less< pcre_key >, true > | regex_map_t |
| | pcre expression multimap
|
Enumerations |
| enum | token_type {
TT_UNKNOWN = 0,
TT_REGEX,
TT_ABBR,
TT_COMPOSE,
TT_ALPHABETIC,
TT_DIGIT,
TT_WHITESPACE,
TT_SYMBOL,
TT_PUNCTUATION,
TT_DOT
} |
| | atomic token type More...
|
Variables |
| BEGIN_TERIMBER_NAMESPACE const size_t | T_REGEX = 0x00000001 |
| const size_t | T_ABBR = 0x00000002 |
| const size_t | T_HYPHEN = 0x00000004 |
| const size_t | T_ALL = T_REGEX | T_ABBR | T_HYPHEN |
Typedef Documentation
Enumeration Type Documentation
atomic token type
- Enumerator:
-
| TT_UNKNOWN |
unknown type |
| TT_REGEX |
regular expression, e.g. 2005-11-11 12:00:00.333 |
| TT_ABBR |
abbreviation, e.g. Dr., Mr., Ms., Gen. |
| TT_COMPOSE |
composite token, e.g. semi-final, Cup-2005, F-117 |
| TT_ALPHABETIC |
alphabetic, e.g. Terimber |
| TT_DIGIT |
digits, e.g. 1, 345 |
| TT_WHITESPACE |
white space: blanks, tabs, new lines, carriage returns, ... |
| TT_SYMBOL |
symbol, e.g. #$^... |
| TT_PUNCTUATION |
punctuation ,:;!?- |
| TT_DOT |
dot |
Definition at line 47 of file tokenizer.h.
Variable Documentation
| const size_t T_ABBR = 0x00000002 |
| BEGIN_TERIMBER_NAMESPACE const size_t T_REGEX = 0x00000001 |