Neuron®
The Neuron® is the basis for the creation of open and secure federated networks for smart societies.
Loading...
Searching...
No Matches
StringTokenizer.cs
1using System;
2using System.Collections.Generic;
3using System.Globalization;
4using System.Text;
5using System.Threading.Tasks;
7
9{
14 {
19 {
20 }
21
/// <summary>
/// How well the tokenizer can tokenize objects of type <paramref name="Type"/>.
/// </summary>
/// <param name="Type">Type of object to tokenize.</param>
/// <returns><c>Grade.Ok</c> when <paramref name="Type"/> is exactly <see cref="string"/>,
/// otherwise <c>Grade.NotAtAll</c>.</returns>
public Grade Supports(Type Type)
{
    // Only plain strings are handled by this tokenizer.
    return Type == typeof(string) ? Grade.Ok : Grade.NotAtAll;
}
34
/// <summary>
/// Tokenizes an object.
/// </summary>
/// <param name="Value">Object to tokenize. Anything that is not a string is ignored.</param>
/// <param name="Process">Current tokenization process.</param>
/// <returns>An already completed task; the work is done synchronously.</returns>
public Task Tokenize(object Value, TokenizationProcess Process)
{
    // Non-string values (including null) produce no tokens.
    if (!(Value is string Text))
        return Task.CompletedTask;

    Tokenize(Text, Process);

    return Task.CompletedTask;
}
47
/// <summary>
/// Tokenizes a string.
/// </summary>
/// <remarks>
/// The text is lower-cased using the invariant culture, normalized to Unicode form D, and
/// non-spacing marks are dropped. Every maximal run of letters and digits in the result
/// becomes one token; any other character acts as a separator. For each token found,
/// <c>Process.DocumentIndexOffset</c> is incremented and the new value is appended to the
/// token's list in <c>Process.TokenCounts</c> (the list is created on first use).
/// </remarks>
/// <param name="Text">String to tokenize. Null or empty strings are ignored.</param>
/// <param name="Process">Current tokenization process, updated with the tokens found.</param>
public static void Tokenize(string Text, TokenizationProcess Process)
{
    if (string.IsNullOrEmpty(Text))
        return;

    StringBuilder sb = new StringBuilder();

    // Invariant lower-casing keeps the generated tokens independent of the current
    // thread culture (culture-sensitive casing maps e.g. "I" differently in Turkish).
    foreach (char ch in Text.ToLowerInvariant().Normalize(NormalizationForm.FormD))
    {
        // Form D splits accented letters into base letter + combining mark;
        // skipping the mark leaves only the base letter in the token.
        if (CharUnicodeInfo.GetUnicodeCategory(ch) == UnicodeCategory.NonSpacingMark)
            continue;

        if (char.IsLetterOrDigit(ch))
            sb.Append(ch);
        else
            FlushToken();
    }

    // The text may end in the middle of a token.
    FlushToken();

    // Registers the token collected so far (if any) and resets the buffer.
    void FlushToken()
    {
        if (sb.Length == 0)
            return;

        string Token = sb.ToString();
        sb.Clear();

        if (!Process.TokenCounts.TryGetValue(Token, out List<uint> DocIndex))
        {
            DocIndex = new List<uint>();
            Process.TokenCounts[Token] = DocIndex;
        }

        DocIndex.Add(++Process.DocumentIndexOffset);
    }
}
107
108 }
109}
Grade Supports(Type Type)
How well the tokenizer can tokenize objects of type Type.
static void Tokenize(string Text, TokenizationProcess Process)
Tokenizes a string.
Task Tokenize(object Value, TokenizationProcess Process)
Tokenizes an object.
Contains information about a tokenization process.
Interface for full-text-search tokenizers
Definition: ITokenizer.cs:12
Grade
Grade enumeration
Definition: Grade.cs:7