Mega Code Archive

Parsehtml

The following code demonstrates how to parse an HTML file, looking for begin tags, end tags and raw text. The following routine demonstrates how to parse an HTML file. I welcome feedback to improve the routine; if you have any suggestions/hints, please let me know. Regards, Si Carter ---------------- BEGIN CODE BLOCK ------------------------ unit HTMLParse; (*************************************************************************** HTMLParse Purpose: Parse a html file to extract tags and plain text. Copyright © 2003 - TECT Software Ltd. All Rights Reserved. All code remains the property of TECT Software Ltd and may not be changed without permission. Use of this code is granted to any developer for private, open source or commercial applications. No warranty expressed or implied. Use at own risk. Contact: WEB - www.tectsoft.com EMail - support@tectsoft.com Copyright Notice Must Remain With File. Visit www.tectsoft.com for *low cost* developer friendly web hosting. Requires: FastStrings from http://www.droopyeyes.com Usage: See www.howtodothings.com for demo usage. 
****************************************************************************)

interface

uses
  Classes, FastStringFuncs, FastStrings;

type
  // Kind of item reported to the parse callback.
  TTagType = (ttBeginTag, ttEndTag, ttRawText);

  // Callback invoked once per item found by ParseHTML.
  //   HTMLData   - for tags: the upper-cased tag name (end tags keep their
  //                leading '/', e.g. '/B'; '<br/>' is reported as 'BR/');
  //                for raw text: the trimmed text with entities decoded.
  //   TagType    - ttBeginTag, ttEndTag or ttRawText.
  //   Parameters - for tags: one line per attribute with the double quotes
  //                removed (e.g. 'href=index.html'); nil for raw text.
  //                The list is freed as soon as the callback returns, so
  //                the callback must not keep a reference to it.
  THTMLParseProc = procedure(const HTMLData: string; TagType: TTagType;
    Parameters: TStrings);

// Scans HTML from start to end and calls ParseProc for every tag and every
// non-blank run of text, in document order.
procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);

implementation

uses
  SysUtils;

const
  // Entity -> character table applied to raw text.
  // NOTE: the web page this unit was published on had unescaped the entity
  // names (every pair read like ('&', '&')), which made the decoding a
  // no-op; the names are restored here.
  // '&amp;' must stay LAST: decoding it earlier would turn '&amp;lt;' into
  // '&lt;' and then into '<' (double decoding).
  THTMLReplaceWords: array[0..4] of array[0..1] of string = (
    ('&lt;',   '<'),
    ('&gt;',   '>'),
    ('&quot;', '"'),
    ('&nbsp;', ' '),
    ('&amp;',  '&'));

procedure ParseHTML(const HTML: string; ParseProc: THTMLParseProc);

  // Reports one run of text between tags. Blank runs are skipped.
  procedure EmitText(const RawText: string);
  var
    Decoded: string;
    I: Integer;
  begin
    Decoded := Trim(RawText);
    if Decoded = '' then
      Exit;
    for I := Low(THTMLReplaceWords) to High(THTMLReplaceWords) do
      Decoded := FastReplace(Decoded, THTMLReplaceWords[I, 0],
        THTMLReplaceWords[I, 1]);
    ParseProc(Decoded, ttRawText, nil);
  end;

  // Reports one tag. TagBody is everything between '<' and '>'.
  procedure EmitTag(const TagBody: string);
  var
    Body, TagName, Attrs: string;
    NameLen: Integer;
    Params: TStringList;
  begin
    Body := Trim(TagBody);
    if Body = '' then
      Exit;

    // The tag name ends at the first whitespace character of any kind
    // (space, tab, line break), not only at a plain space.
    NameLen := 0;
    while (NameLen < Length(Body)) and (Body[NameLen + 1] > ' ') do
      Inc(NameLen);
    TagName := Copy(Body, 1, NameLen);

    // Turn the attribute text into one line per attribute: ';' and the
    // '" ' that closes a quoted value become line breaks, and remaining
    // double quotes are dropped.
    Attrs := Trim(Copy(Body, NameLen + 1, MaxInt));
    Attrs := FastReplace(Attrs, ';', #13);
    Attrs := FastReplace(Attrs, '" ', #13);
    Attrs := FastReplace(Attrs, '"', '');

    Params := TStringList.Create;
    try
      Params.Text := Trim(Attrs);
      // Body is trimmed and non-empty, so TagName has at least one char.
      if TagName[1] = '/' then
        ParseProc(UpperCase(TagName), ttEndTag, Params)
      else
        ParseProc(UpperCase(TagName), ttBeginTag, Params);
    finally
      Params.Free;
    end;
  end;

var
  I: Integer;
  TextStart: Integer; // index of the first character not yet reported
  TagStart: Integer;  // index of the most recent unmatched '<', 0 if none
begin
  Assert(Assigned(ParseProc));
  TextStart := 1;
  TagStart := 0;
  for I := 1 to Length(HTML) do
    case HTML[I] of
      '<':
        // A later '<' supersedes an earlier unmatched one; the earlier one
        // then simply stays part of the surrounding text.
        TagStart := I;
      '>':
        // A '>' with no open '<' is ordinary text and is left alone.
        if TagStart > 0 then
        begin
          EmitText(Copy(HTML, TextStart, TagStart - TextStart));
          EmitTag(Copy(HTML, TagStart + 1, I - TagStart - 1));
          TextStart := I + 1;
          TagStart := 0;
        end;
    end;
  // Report whatever follows the last complete tag, including a trailing
  // '<' that was never closed, so that no input is silently dropped.
  EmitText(Copy(HTML, TextStart, MaxInt));
end;

end.