Neuron®
The Neuron® is the basis for the creation of open and secure federated networks for smart societies.
Loading...
Searching...
No Matches
HtmlTokenizer.cs
1using System;
2using System.Text;
3using System.Threading.Tasks;
4using Waher.Content;
10
12{
17 {
22 {
23 }
24
/// <summary>
/// Reports how well this tokenizer handles objects of a given type.
/// </summary>
/// <param name="Type">Type of object to check.</param>
/// <returns>
/// <see cref="Grade.Ok"/> when <paramref name="Type"/> is exactly
/// <see cref="HtmlDocument"/>; <see cref="Grade.NotAtAll"/> for any other type.
/// </returns>
public Grade Supports(Type Type)
{
    // Exact type comparison: types derived from HtmlDocument are not matched.
    return Type == typeof(HtmlDocument) ? Grade.Ok : Grade.NotAtAll;
}
37
/// <summary>
/// Reports how well this tokenizer handles files with a given extension.
/// </summary>
/// <param name="Extension">File extension, without a leading period.</param>
/// <returns>
/// <see cref="Grade.Ok"/> for the extensions "htm", "html" and "xhtml";
/// <see cref="Grade.NotAtAll"/> for everything else.
/// </returns>
public Grade Supports(string Extension)
{
    // Ordinal, case-sensitive comparison, exactly as a switch over string constants.
    bool IsHtml = Extension == "htm" || Extension == "html" || Extension == "xhtml";

    if (IsHtml)
        return Grade.Ok;

    return Grade.NotAtAll;
}
56
/// <summary>
/// Tokenizes an object, provided it is an <see cref="HtmlDocument"/>.
/// Values of any other type (including null) are silently ignored.
/// </summary>
/// <param name="Value">Object to tokenize.</param>
/// <param name="Process">Tokenization process receiving the tokens.</param>
public async Task Tokenize(object Value, TokenizationProcess Process)
{
    if (!(Value is HtmlDocument Doc))
        return;

    await Tokenize(Doc, Process);
}
67
/// <summary>
/// Tokenizes an HTML document by collecting its text (attribute values and
/// text nodes, starting at the root) and handing it to the string tokenizer.
/// </summary>
/// <param name="Doc">HTML document to tokenize.</param>
/// <param name="Process">Tokenization process receiving the tokens.</param>
/// <returns>An already completed task; all work is done synchronously.</returns>
public static Task Tokenize(HtmlDocument Doc, TokenizationProcess Process)
{
    StringBuilder Collected = new StringBuilder();
    GetText(Doc.Root, Collected);

    string PlainText = Collected.ToString();
    StringTokenizer.Tokenize(PlainText, Process);

    return Task.CompletedTask;
}
83
/// <summary>
/// Recursively appends the text found in a node to a string builder.
/// For an element, every attribute value is appended first, followed by the
/// text of each child in order. For a text node, its inline text is appended.
/// Each appended piece is preceded by a single space. Other node kinds
/// (and null) contribute nothing.
/// </summary>
/// <param name="N">Node to extract text from.</param>
/// <param name="Text">Builder receiving the extracted text.</param>
private static void GetText(HtmlNode N, StringBuilder Text)
{
    // Elements are checked before text nodes, mirroring the original test order.
    if (N is HtmlElement Element)
    {
        if (Element.HasAttributes)
        {
            foreach (HtmlAttribute Attribute in Element.Attributes)
                Text.Append(' ').Append(Attribute.Value);
        }

        if (Element.HasChildren)
        {
            foreach (HtmlNode Child in Element.Children)
                GetText(Child, Text);
        }

        return;
    }

    if (N is HtmlText Leaf)
        Text.Append(' ').Append(Leaf.InlineText);
}
109
/// <summary>
/// Tokenizes the file identified by a file reference: the file is read as
/// text, parsed as an HTML document, and the document is then tokenized.
/// </summary>
/// <param name="Reference">Reference to the file to tokenize.</param>
/// <param name="Process">Tokenization process receiving the tokens.</param>
public async Task Tokenize(FileReference Reference, TokenizationProcess Process)
{
    HtmlDocument Parsed = new HtmlDocument(await Resources.ReadAllTextAsync(Reference.FileName));

    await Tokenize(Parsed, Process);
}
122 }
123}
string Value
Attribute value.
HtmlElement Root
Root element.
Definition: HtmlDocument.cs:72
Base class for all HTML elements.
Definition: HtmlElement.cs:12
Base class for all HTML nodes.
Definition: HtmlNode.cs:11
Static class managing loading of resources stored as embedded resources or in content files.
Definition: Resources.cs:15
static async Task< string > ReadAllTextAsync(string FileName)
Reads a text file asynchronously.
Definition: Resources.cs:205
Tokenizes contents defined in an HTML document.
static Task Tokenize(HtmlDocument Doc, TokenizationProcess Process)
Tokenizes an HTML document.
HtmlTokenizer()
Tokenizes contents defined in an HTML document.
async Task Tokenize(object Value, TokenizationProcess Process)
Tokenizes an object.
Grade Supports(string Extension)
How well the file tokenizer supports files of a given extension.
Grade Supports(Type Type)
If the interface understands objects such as Type.
async Task Tokenize(FileReference Reference, TokenizationProcess Process)
Tokenizes the HTML file identified by a file reference.
Contains a reference to an indexed file.
CaseInsensitiveString FileName
Name of collection hosting object.
Task Tokenize(object Value, TokenizationProcess Process)
Tokenizes an object.
Contains information about a tokenization process.
Interface for file tokenizers. Best tokenizer is selected
Interface for full-text-search tokenizers
Definition: ITokenizer.cs:12
Grade
Grade enumeration
Definition: Grade.cs:7