This is a classic library that can help you generate valid XHTML from HTML. It also provides support for tag and attribute filtering. You can specify which tags and attributes are allowed to appear in the output, while other tags are filtered out. You can also use this library to clean up bloated HTML generated when converting Microsoft Word documents to HTML. You should also clean up the HTML before publishing it to your blog site, otherwise blog engines like WordPress, b2evolution, etc. will reject it.
There are two classes: HtmlReader and HtmlWriter
HtmlReader extends the famous SgmlReader developed by Chris Lovett. When it reads HTML, it skips every node whose name carries a prefix. That removes the hundreds of useless tags that Microsoft Word emits, such as <o:p> and <st1:place>.
HtmlWriter extends the conventional XmlTextWriter, which generates XML. XHTML is essentially HTML in XML format. All the tags you are familiar with, such as <img>, <br> and <hr>, must therefore be written as well-formed XML elements (for example <br />).
HtmlReader is very simple. Here is the complete class:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
/// <summary>
/// An SGML reader that skips every element whose name has some kind of
/// prefix (a ':' after the first character). Dropping those elements is
/// the trick that cleans up MS Word/Outlook HTML markup.
/// </summary>
public class HtmlReader : Sgml.SgmlReader
{
    /// <summary>
    /// Creates a reader over an existing text reader and sets the
    /// document type to HTML.
    /// </summary>
    /// <param name="reader">Source of the HTML markup.</param>
    public HtmlReader(TextReader reader) : base()
    {
        base.InputStream = reader;
        base.DocType = "HTML";
    }

    /// <summary>
    /// Creates a reader over an in-memory HTML string and sets the
    /// document type to HTML.
    /// </summary>
    /// <param name="content">The HTML markup to read.</param>
    public HtmlReader(string content) : base()
    {
        base.InputStream = new StringReader(content);
        base.DocType = "HTML";
    }

    /// <summary>
    /// Reads the next node. If that node is an element with a prefixed
    /// name, the element is skipped entirely.
    /// </summary>
    /// <returns>The value returned by the base reader's Read call.</returns>
    public override bool Read()
    {
        bool status = base.Read();
        if (status)
        {
            if (base.NodeType == XmlNodeType.Element)
            {
                // Got a node with a prefix. Skip this node entirely: we
                // want prefix-less nodes so that the resulting XML does
                // not require any namespace declaration.
                // NOTE(review): the node the reader lands on after Skip()
                // is not re-checked, so a prefixed element that directly
                // follows a skipped one may pass through - confirm
                // against SgmlReader's Skip() behavior.
                if (base.Name.IndexOf(':') > 0)
                {
                    base.Skip();
                }
            }
        }
        return status;
    }
}
这个类是有点麻烦。下面是使用技巧:
重写WriteString方法并避免使用常规的XML编码。对HTML文件手动更改编码。
重写WriteStartElement以避免不被允许的标签写到输出中。
重写WriteAttributes以避免不需要的属性。
让我们分部分来看下整个类:
你可以通过修改下面的部分配置HtmlWriter:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
/// <summary>
/// XML text writer with configurable HTML clean-up settings.
/// This excerpt contains the configuration fields only.
/// </summary>
public class HtmlWriter : XmlTextWriter
{
    /// <summary>
    /// If set to true, it will filter the output
    /// by using tag and attribute filtering,
    /// space reduce etc.
    /// </summary>
    public bool FilterOutput = false;

    /// <summary>
    /// If true, consecutive spaces are reduced to one instance.
    /// </summary>
    public bool ReduceConsecutiveSpace = true;

    /// <summary>
    /// Set the tag names in lower case which are allowed to go to output.
    /// </summary>
    public string[] AllowedTags =
        new string[] { "p", "b", "i", "u", "em", "big", "small",
            "div", "img", "span", "blockquote", "code", "pre", "br", "hr",
            "ul", "ol", "li", "del", "ins", "strong", "a", "font", "dd", "dt" };

    /// <summary>
    /// If any tag is found which is not allowed, it is replaced by this tag.
    /// Specify a tag which has the least impact on output.
    /// </summary>
    public string ReplacementTag = "dd";

    /// <summary>
    /// New lines (\r\n) are replaced with a space,
    /// which saves space and makes the output compact.
    /// </summary>
    public bool RemoveNewlines = true;

    /// <summary>
    /// Specify which attributes are allowed.
    /// Any other attribute will be discarded.
    /// </summary>
    public string[] AllowedAttributes = new string[]
    {
        "class", "href", "target", "border", "src",
        "align", "width", "height", "color", "size"
    };
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
/// <summary>
/// The reason why we are overriding this method is that we do not want
/// the output to be encoded by the regular XML encoding for text inside
/// attributes and inside elements. For example, every &amp;nbsp; would
/// get converted to &amp;amp;nbsp; in the output, but HTML needs
/// &amp;nbsp; as it is. The text is therefore escaped by hand and
/// emitted with WriteRaw.
/// </summary>
/// <param name="text">The text to write; null is treated as empty.</param>
public override void WriteString(string text)
{
    if (text == null)
    {
        text = string.Empty;
    }

    // Emit non-breaking space characters as the HTML entity.
    // NOTE(review): the entity literals in this method were reconstructed
    // from the surrounding comments - confirm against the original code.
    text = text.Replace("\u00A0", "&nbsp;");

    // When you are reading an RSS feed and writing HTML, these lines
    // help remove the CDATA markers. They must run before '<' and '>'
    // are escaped below, otherwise the markers would no longer match.
    text = text.Replace("<![CDATA[", "");
    text = text.Replace("]]>", "");

    // Do some encoding of our own because we are going to use WriteRaw,
    // which won't do any of the necessary encoding. '&' is left alone
    // so that entities already present in the text survive.
    text = text.Replace("<", "&lt;");
    text = text.Replace(">", "&gt;");
    text = text.Replace("'", "&apos;");
    text = text.Replace("\"", "&quot;");

    if (this.FilterOutput)
    {
        text = text.Trim();

        // We want to replace consecutive spaces with one space in order
        // to save horizontal width. Repeat until no pair is left, since
        // a single Replace pass only halves a long run.
        if (this.ReduceConsecutiveSpace)
        {
            while (text.IndexOf("&nbsp;&nbsp;", StringComparison.Ordinal) >= 0)
            {
                text = text.Replace("&nbsp;&nbsp;", "&nbsp;");
            }
        }

        if (this.RemoveNewlines)
        {
            text = text.Replace(Environment.NewLine, " ");
        }
    }

    base.WriteRaw(text);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/// <summary>
/// Writes the start of an element. When FilterOutput is enabled and the
/// element's local name is not listed in AllowedTags, the element is
/// written under ReplacementTag instead.
/// </summary>
/// <param name="prefix">Namespace prefix of the element.</param>
/// <param name="localName">Local name of the element.</param>
/// <param name="ns">Namespace URI of the element.</param>
public override void WriteStartElement(string prefix,
    string localName, string ns)
{
    if (this.FilterOutput)
    {
        // Lower-case with the invariant culture so the comparison does
        // not depend on the current thread culture (e.g. "IMG" must
        // always become "img").
        string tagLocalName = localName.ToLowerInvariant();

        // Array.IndexOf uses ordinal string equality, the same as the
        // '==' comparison against each allowed name.
        bool canWrite = System.Array.IndexOf(this.AllowedTags, tagLocalName) >= 0;

        if (!canWrite)
        {
            // Use the configurable replacement rather than a literal "dd".
            localName = this.ReplacementTag;
        }
    }

    base.WriteStartElement(prefix, localName, ns);
}
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
// Fragment of the attribute filter; the enclosing method header and loop
// are not part of this excerpt.
// NOTE(review): presumably 'reader' is positioned on an attribute here -
// confirm against the full method.
// Check whether the attribute is allowed: its lower-cased local name
// must match one of the entries in AllowedAttributes.
bool canWrite = false ;
string attributeLocalName = reader.LocalName.ToLower();
foreach( string name in this .AllowedAttributes )
{
if ( name == attributeLocalName )
{
canWrite = true ;
break ;
}
}
// If allowed, write the attribute. It is written under the lower-cased
// local name, keeping the reader's prefix and namespace URI.
if ( canWrite )
this .WriteStartAttribute(reader.Prefix,
attributeLocalName, reader.NamespaceURI);
// The attribute value is always read to the end, even when the attribute
// is not written; only the write calls are guarded by canWrite.
while (reader.ReadAttributeValue())
{
if (reader.NodeType == XmlNodeType.EntityReference)
{
// Entity references are written as references rather than as text.
if ( canWrite ) this .WriteEntityRef(reader.Name);
continue ;
}
if ( canWrite ) this .WriteString(reader.Value);
}
if ( canWrite ) this .WriteEndAttribute();
示例应用是一个你可以立即用来清理HTML文件的实用工具。你可以将这个类应用在像博客等需要发布一些HTML到Web服务的工具中。