Mega Code Archive

 
Categories / Delphi / Examples
 

Boyermooresearch

{'SEARCH' module for WRITER'S TOOLKIT package...} {A program to implement a fast text search algorithm for use with LARGE files. The search routine uses a variation of the Boyer-Moore Search Algorithm (adapted by the autho). The program deals with large files by searching a piece at a time. The 'pieces' exist as buffers in memory, and since the search algorithm involves 'backing up' in the file from time to time, this is taken into account when loading the next n source file bytes into the buffer (so we backtrack searchPattLen characters in the source file before reading the next buffer)... NOTE THAT THIS IS one UNIT TAKEN FROM A COMPLETE PROGRAM 'AS IS', without editing, without stripping out the lines of code that don't compile, so if you want to use the GOOD code that's here it's up to you to strip out out what you need, etc... but I promise you, this should not be difficult!!!} interface uses Windows, Messages, SysUtils, Classes, Graphics, Controls, Forms, Dialogs, StdCtrls, IniFiles; const {remember to always put your const declarations before your type declarations in this section of a Delphi Pascal unit...} {declare maximum size of buffer to be 63K, which is 64512 bytes. Disk cluster sizes are usually multiples of 1024 bytes, so making maxBufferSize also a multiple of 1024 can help speed up disc reads} maxBufferSize = 1024 * 63; {an 'master' constant to determine max length of edit box input, the length of lines in the Search Results window, and also the length of 'chunks' of text extracted from source files...} maxLineLen = 82; {maximum allowable length of search pattern input by the user...} maxInputLen = maxLineLen; {length of data 'chunk' to extract from the source string at the position of a found match: the chunkLen variable is used when we pull out a chunk of characters from the source file at the position where a match is found -note that chunkLen must be less than maxInputLen...} chunkLen = maxLineLen; {the next constant is used to limit searches to no more than 2000 'hits' -the assumption is that users i) don't want to sit and wait while the search algorithm finds every instance of 'the' in all of Shakespeare, ii) 2000 hits is already an intractably large amount from the user's point of view...} maxFoundMatches = 2000; {define a constant for the 'maximum ASCII index', bearing in mind that the ASCII table starts at 0 and ends at 255...} maxASCindex = 255; {in Delphi string indexes go from 1 to 255 by default, unless you want longer strings in which case you can set a compiler switch for that...} maxStringLen = 255; helpFileName = 'Search.hlp'; iniFileName = 'WTSearch.ini'; type {define a buffer as a character array type of length maxBufferSize...} TSearchBufferArray = array[1..maxBufferSize] of char; {and define a pointer type to point into the buffer} TSearchBuffer = ^TSearchBufferArray; TMatchInfo = record fileNum: Integer; filePos: LongInt; fileSize: LongInt; end; TVisibleChars = set of Char; type TSrchForm = class(TForm) SDomainButton: TButton; PromptLabel: TLabel; PatternEdit: TEdit; SearchButton: TButton; CloseButton: TButton; HelpButton: TButton; procedure PutClipTextInEditBox; procedure FormCreate(Sender: TObject); procedure FormClose(Sender: TObject; var Action: TCloseAction); procedure FormDestroy(Sender: TObject); procedure CloseButtonClick(Sender: TObject); function CreateBuffer: Boolean; procedure DestroyBuffer; procedure InitSkipArray; procedure InitMInfoArray; procedure InitialiseSearch; procedure ProcessInputText; procedure SeparateANDInput; function CheckPlusInputOK: Boolean; procedure SearchButtonClick(Sender: TObject); procedure AddFileSize(fileNumber: Integer; fileSize: Integer); function LoadSourceFile(srcFileName: String): String; procedure OverwriteDuffBufferChars; function GetError(const ErrorCode: Integer): String; procedure DefaultSearch; procedure ANDSearch; procedure CapitaliseBuffer; procedure GetChunk(bufPos: LongInt); function EditLine(tempLine: array of Char; i: Integer): String; procedure SearchButtonKeyDown(Sender: TObject; var Key: Word; Shift: TShiftState); procedure SDomainButtonClick(Sender: TObject); procedure EndSearch; procedure HelpButtonClick(Sender: TObject); procedure ReadINIFile; procedure WriteINIFile; private {private declarations} {all variables declared here are global to THIS unit...} {define a buffer variable (which is of pointer type, don't forget)...} buffer : TSearchBuffer; {these other variables are 'global' too of course...} {declare a 'skip' array -see psuedocode to understand it's function...} skip: array[0..maxASCindex] of Integer; {next variable, in conjunction with the chunkLen constant, helps us decide how much text to extract from the source file at ther position of a found match...} halfaChunk: Integer; {the numMatches variable has to be global since we need to keep track of the number of hits found during a search, and in the process of searching we may call a search routine many many times, so that having numMatches local to Search() wouldn't be too clever, would it now...} numMatches: Integer; {next variable is used if a user wants to interrupt a search by pressing the ESC key...} stopSearch: Boolean; {next variable used to keep track of the identity of the current file} currFileNum: Integer; {and this one to keep track of it's size...} currFileSize: LongInt; {see the SeparateANDInput routine for the way we use VisibleChars} VisibleChars: TVisibleChars; public {public declarations...} {declare a string for the search pattern -first index is at pos 1 remember} searchPatt: String; {and a similar one for 'AND' searches (where the pattern is in two parts)..} searchPattTwo: String; {declare a variable to hold the length of the search pattern- we use this in various places including the search routine, obviously, but we also use it to backtrack pattLen chars in the source file before loading a new buffer into memory -this ensures that the search algorithm does not miss any matches at the join between buffers as backtracking is involved in the search process...} pattLen: Integer; {and a similar one for 'AND' searches (where the pattern is in two parts)..} pattLenTwo: Integer; {a variable to help keep track of different kinds of searches...} searchType: String; {next a list of strings to hold the names of all user-selected files in the 'search domain'...} fileList: TStringList; {this 'match info array' will be used to hold reference info on i) the source file(name) ii) file position for each found match, and iii) the size of each source file we scan -we can find the name by using the file 'number' as an index into a file list...} mInfoArray: array[0..maxFoundMatches] of TMatchInfo; {next variables help define the number of characters separating words in 'AND' searches...} ANDsearchWidth: Integer; defaultANDsearchWidth: Integer; {Boolean variable to help keep track of whether or not current search is 'case-sensitive' or 'case-insensitive' -the DEFAULT is case-sensitive...} caseInSensitive: Boolean; end; var SrchForm: TSrchForm; implementation uses SDUnit, PageUnit, SREUnit; {NOTES ON OTHER UNITS: the ResultsForm holds a Memo component, -earlier version used a RichEdit which is like a Memo but which allowed greater control at run-time, of font styles in particular- however, it's easier to synchronise scrolling between two Memos than it is between a Memo and a RichEdit. The Memo component has the following properties set: ReadOnly is True, ScrollBars is ssVertical, and the Font is 12 point...} {NOTES ON OTHER UNITS: the SDForm (for Search Domain form) holds a FileListBox, a DriveComboBox, a DirectoryListBox, and a FilterComboBox. These are set up to talk to each other, allowing the user to specify a number of files to search in, so they can add or subtract files as necessary...} {NOTES ON OTHER UNITS: the PageForm holds the PageMemo which is used to display a whole page of source text if the user double-clicks on a line in the 'search results' window...} {$R *.DFM} procedure TSrchForm.PutClipTextInEditBox; {here we simply put whatever text the user has selected in the main 'WP/Text-Editor' window INTO the Edit Box of the Search input form, leaving the user to provide a Search Domain before they can proceed...} var inputOK: Boolean; begin {get text FROM the Clipboard...} PatternEdit.Clear; PatternEdit.PasteFromClipboard; end; procedure TSrchForm.FormCreate(Sender: TObject); {use the 'tag' attribute of the main form to ascertain whether or not we are accessing the form for the first time, and set various variables...} begin if (SrchForm.Tag = 0) then begin {it's very easy to forget to 'create' an instance of a StringList object -which is what we do in the next line...} fileList := TStringList.Create; {xxxDODGY???} fileList.Clear; end; SrchForm.Tag := SrchForm.Tag + 1; defaultAndsearchWidth := 10; ANDsearchWidth := defaultAndsearchWidth; {the minus 2 below is a safety measure...} SrchForm.PatternEdit.MaxLength := (maxLineLen - 2); {next line shouldn't be necessary but just to be on the safe side...} searchType := 'DEFAULT_SEARCH'; caseInSensitive := False; {the set below is a complete set of visible chars, BUT for our purposes we must EXCLUDE the '+' character (Chr(44), to facilitate 'AND' searches) so really the set below becomes the VisibleCharsMinusPlusSign set...} VisibleChars := [Chr(33)..Chr(42),Chr(44)..Chr(126),Chr(145)..Chr(151),Chr(161)..Chr(171), Chr(173)..Chr(174),Chr(176)..Chr(181),Chr(183)..Chr(255)]; {now use .INI file information to position things as they were when last used} ReadINIFile; end; procedure TSrchForm.FormClose(Sender: TObject; var Action: TCloseAction); begin WriteIniFile; Close; Application.Terminate; end; procedure TSrchForm.CloseButtonClick(Sender: TObject); {we DON'T want to close down this form completely or free up memory used for our 'fileList' stringList. Better to HIDE the form and free up that memory in a FormDestroy event handler...} begin SrchForm.Hide; end; procedure TSrchForm.FormDestroy(Sender: TObject); begin {free up the memory used by the file list for other applications...} fileList.Free; end; function TSrchForm.CreateBuffer: Boolean; {create a maxBufferSize buffer, but don't do anything with it...} var MemoryStatus: Boolean; begin MemoryStatus := True; try {to allocate memory} getmem(Buffer, maxBufferSize) except MemoryStatus := False; end; {return True if there IS enough memory, return False if there isn't...} Result := MemoryStatus; end; procedure TSrchForm.DestroyBuffer; {free the memory that Buffer points to...} begin freemem(Buffer, maxBufferSize); end; procedure TSrchForm.InitSkipArray; {set up a 'skip' array for use in the search algorithm whereby each character in the search pattern has a number associated with it telling us how far we should move to the left in the source file if we find a sourcefile character that is (somewhere) in the search pattern... see elsewhere for psuedocode on the detail of the algorithm- it's too esoteric to detail here...} var i, j: Integer; begin {set up the skip array so that all characters in the pattern have a numeric value associated with them, which is THE NUMBER OF CHARACTERS FROM THE END OF THE PATTERN. Where there is more than one instance of a character, then use the RIGHTMOST. So that for 'weed', skip(d) = 0, skip(e) = 1, and skip(w) = 3. For characters which do not appear in the search pattern, then the corresponding value in the skip array is the LENGTH OF THE SEARCH PATTERN. This enables us to jump forward pattLen places in the source text where appropriate...} for i := 0 to 255 do begin skip[i] := pattLen; end; for i := 0 to 255 do begin for j := 1 to (pattLen - 1) do begin if (searchPatt[j] = Chr(i)) then begin skip[i] := pattLen - j; end; {end if} end; {end for} end; {end for} end; procedure TSrchForm.InitMInfoArray; var i: Integer; begin for i := 0 to maxFoundMatches do begin mInfoArray[i].fileNum := 0; mInfoArray[i].filePos := 0; mInfoArray[i].fileSize := 0; end; end; procedure TSrchForm.InitialiseSearch; {set the values of all manner of global variables, set up the 'pattern set' and 'skip array' variables, display 'BUSY' caption and hourglass cursor, and hide the 'search results' form...} var i: Integer; begin numMatches := 0; currFileNum := 0; stopSearch := False; InitMInfoArray; {in next two lines we determine the amount of text both before and after a match to extract (along with the match sequence itself) and by this means we extract a consistent total size of 'chunk'...} halfaChunk := ((chunkLen - pattLen) div 2); if (halfaChunk < 0) then halfaChunk := 10; {if we've already done a search then ensure our Memo form is hidden- this ensures that the user will not have to sit and wait while the program writes to a displayed Memo (which would slow everything down)...} ResultsForm.ClearMemos; ResultsForm.Hide; {now convert the search pattern to uppercase IF caseInSensitive is True...} if (caseInsensitive = True) then begin searchPatt := AnsiUpperCase(searchPatt); if (searchType = 'AND_SEARCH') then searchPattTwo := AnsiUpperCase(searchPattTwo); end; {create a 'skip array'...} InitSkipArray; {do stuff to show that the program IS actually running like shit off a stick} SrchForm.Caption := ' Search Engine BUSY...'; Cursor := crHourglass; end; procedure TSrchForm.ProcessInputText; {find out whether a search is a 'default' search (no '+' or '*' or '?' characters in the input string) or whether the search will be an 'AND' search, a 'wildcard' search, or a 'Qmark' search...} {NOTE THAT WE DON'T FULLY VALIDATE USER INPUT HERE, and deliberately so, since we want to give users the option of looking for whatever bizarre sequence of characters takes their fancy. If input is off-the-wall-bizarre then a search will simply find no matches -no problem, but we DO look for more than one '+' character, and more than one '*' character, as such input WILL be considered invalid. Similarly, there must only be one '+', one '*', or several '?' characters in the input string...} var tempArray: array[0..maxInputLen] of Char; i, inputLen, {asteriskCount, qMarkCount,} plusCount: Integer; plusInputOK: Boolean; begin searchType := 'DEFAULT_SEARCH'; plusCount := 0; {asteriskCount := 0; qMarkCount := 0;} searchPatt := patternEdit.Text; inputLen := Length(searchPatt); {if (inputLen = 0) then begin searchType := 'INVALID_SEARCH'; end;} {if the inputted search string is longer than maxInputLen characters, reduce it's length TO maxInputLen characters...} if (inputLen >= maxInputLen) then begin for i := maxInputLen to (maxStringLen - 1) do begin searchPatt[i] := #0; {#0 is another way of saying Chr(0) aka NULL} end; end; {copy the search pattern into an array...} if (inputLen > 0) then begin for i := 1 to inputLen do begin tempArray[i - 1] := searchPatt[i]; end; tempArray[i - 1] := #0; end; {and check out some dodgy characters in the search string...} for i := 0 to (inputLen - 1) do begin if (tempArray[i] = '+') then begin searchType := 'AND_SEARCH'; Inc(plusCount); end; {IF at some future date code is amended to cater for 'wildcard' and 'question mark' searches, then use the commented-out lines below...} {if (tempArray[i] = '*') then begin searchType := 'WILD_SEARCH'; Inc(asteriskCount); end; if (tempArray[i] = '?') then begin searchType := 'QMARK_SEARCH'; Inc(qMarkCount); end;} end; if (plusCount > 1) then searchType := 'INVALID_SEARCH'; {if (asteriskCount > 1) then searchType := 'INVALID_SEARCH'; if ((plusCount > 0) and (asteriskCount > 0)) then searchType := 'INVALID_SEARCH'; if ((asteriskCount > 0) and (qMarkCount > 0)) then searchType := 'INVALID_SEARCH'; if ((qMarkCount > 0) and (plusCount > 0)) then searchType := 'INVALID_SEARCH';} {I know you can do the above with hands and oars but doing it like this may be more readable. If the search type is an 'AND search', first do one last bit of input validation, and then parse out the two halves of the text...} if (searchType = 'AND_SEARCH') then begin plusInputOK := CheckPlusInputOK; end; if (searchType = 'AND_SEARCH') then begin if (plusInputOk = True) then SeparateANDInput else searchType := 'INVALID_SEARCH'; end; if (searchType = 'DEFAULT_SEARCH') then pattLen := Length(searchPatt); if (searchType = 'INVALID_SEARCH') then begin {searchTextOK := False;} Application.MessageBox('Please check out Help for info on valid input', 'Invalid input', mb_OK); end; end; function TSrchForm.CheckPlusInputOK: Boolean; {this is a bit fiddly, but it has to be done! -Basically, we need to check whether there ARE visible characters to the left and to the right of the '+' character in the input string. Because if there aren't, we can't split the string into two substrings properly...} var i, plusPos, inputLen: Integer; visibleNLeftOK, visibleNRightOK: Boolean; begin visibleNLeftOK := False; visibleNRightOK := False; plusPos := Pos('+', searchPatt); inputLen := Length(searchPatt); if ((plusPos = 1) or (plusPos = inputLen)) then begin Result := False; end else begin for i := 1 to plusPos do begin if (searchPatt[i] in VisibleChars) then visibleNLeftOK := True; end; for i := plusPos to inputLen do begin if (searchPatt[i] in VisibleChars) then visibleNRightOK := True; end; if ((visibleNLeftOK = True) and (visibleNRightOK = True)) then Result := True else Result := False; end; end; procedure TSrchForm.SeparateANDInput; {Delphi can be funny about copying characters from string to string on a character-by-character basis so that's we we use a tempArray here...} var i, j, k, plusPos, inputLen: Integer; tempArray: array[0..maxInputLen] of Char; tempArrayOne: array[0..maxInputLen] of Char; tempArrayTwo: array[0..maxInputLen] of Char; begin plusPos := Pos('+', searchPatt); inputLen := Length(searchPatt); j := 0; {copy all characters from (and including) the '+' into tempArray...} for i := plusPos to inputLen do begin tempArray[j] := searchPatt[i]; Inc(j); end; tempArray[j] := #0; {copy all characters from UP TO the '+' into tempArrayOne...} for i := 1 to (plusPos - 1) do begin tempArrayOne[i - 1] := searchPatt[i]; end; tempArrayOne[i - 1] := #0; {at this point we know the position of the '+' in searchPatt, and we have all characters past (and including) the '+' copied into tempArray, and all characters up the '+' copied into tempArrayOne, BUT we need to take account of the fact that users might type in 'nature+man' for instance, or they might type in 'nature + man', so we need to fiddle- firstly, go BACKWARDS from plusPos in tempArrayOne, overwriting any non-visibles, including spaces, with a NULL until we come to a VISIBLE character...} i := (plusPos - 1); {first draft of this used the IsCharAlphaNumeric function but that's not such a good idea as it doesn't take account of user input that purposely includes punctuation characters (etc?)...} while (i >= 0) do begin Dec(i); if (i = 0) then break; if (not(tempArrayOne[i] in VisibleChars)) then begin tempArrayOne[i] := #0; end else break; end; {and copy this array back into the searchPatt variable...} searchPatt := String(tempArrayOne); {next, go FORWARD in tempArray to find the position of the first visible character AFTER the '+'...} for i := 1 to maxInputLen do begin if (tempArray[i] in VisibleChars) then break; end; {now copy characters from the first visible character (at index i)...} k := 0; for j := i to maxInputLen do begin tempArrayTwo[k] := tempArray[j]; if (tempArray[j] = #0) then break; Inc(k); end; tempArrayTwo[k] := #0; searchPattTwo := String(tempArrayTwo); {at this point we have extracted two different string from what WAS the searchPatt variable, so not only do we have a NEW and shorter searchPatt but we also have a searchPattTwo as well, and we need to keep track of the lengths of BOTH of these...} pattLen := Length(searchPatt); pattLenTwo := Length(searchPattTwo); end; procedure TSrchForm.SearchButtonClick(Sender: TObject); {here we do various 'pre-search' tasks- checking that everything's OK before we do the search: we check that there IS text in the 'PatternEdit' box, and we check that we DO have at least one filename already (for the 'search' aka 'source' files) -next we check that we have enough memory for the buffer. If we fail in any of these tasks we display an error message to the user. But if things are OK so far, then we call InitialiseSearch to do various things such as setting up the 'skip' array. Finally we call LoadSourceFile to load a file into the buffer- the search routine itself being called from LoadSourceFile...} var searchTextOK, fileNameOK, memoryOK: Boolean; fileRdStatus, currFileName, tempString: String; i, numFiles: Integer; exeFile: Boolean; begin searchTextOK := True; fileNameOK := True; memoryOK := True; fileRdStatus := ''; exeFile := False; ProcessInputText; if (searchPatt = '') then begin searchTextOK := False; Application.MessageBox('Please provide a search pattern in the input box provided', 'No search pattern', mb_OK); end; numFiles := fileList.Count; if (numFiles = 0) then begin fileNameOK := False; Application.MessageBox('Please choose file(s) to search in using the Search Domain button', 'No files specified', mb_OK); end; if (searchType = 'INVALID_SEARCH') then searchTextOK := False; if ((searchTextOK) and (fileNameOK))then begin memoryOK := CreateBuffer; if (memoryOK = False) then Application.MessageBox('The program does not have enough memory to play with -free up some system resources if possible', 'Not enough memory', mb_OK); end; if ((searchTextOK) and (fileNameOK) and (memoryOK)) then begin InitialiseSearch; {the 'minus 1' next line is v. important, as the filelist is zero-based...} for i := 0 to (numFiles - 1) do begin {go through the list of source files in sequence, loading each of them into memory as we call LoadSourceFile -note that the SEARCH routine itself is called from within the depths of the LoadSourceFile repeat loop...} currFileName := fileList[i]; {in line below, note that fileList indexing starts at 0 but we NEED to start at 1 in the mInfoArray (below) which is why we make sure we add 1 to currFileNum here...} currFileNum := i + 1; currFileSize := 0; tempString := ExtractFileExt(currFileName); if (tempString = '.exe') then exeFile := True; if not(exeFile = True) then begin fileRdStatus := LoadSourceFile(currFileName); {add file size to each entry in the mInfoArray for the current source file...} AddFileSize(i, currFileSize); if (not(fileRdStatus = '')) then begin tempString := 'Cannot read ' + uppercase(currFileName) + '.' + #13 + fileRdStatus + '.'; Application.MessageBox(PChar(tempString), 'Error reading file', mb_OK); end; end; {if not exe file...} end; DestroyBuffer; EndSearch; end; end; procedure TSrchForm.AddFileSize(fileNumber: Integer; fileSize: Integer); {add information on the size of the current file to the 'match info array' making sure we only add this information to entries pertaining to the current file...} var index: Integer; begin {add 1 to the fileNumber since although in the file list the first file is numbered 0, we number them starting from 1 in the mInfoArray, OK...} fileNumber := fileNumber + 1; {firstly step through the array until we find the first entry for the current file- note that each time we enter this procedure the value of fileNumber will be the number of the last set of entries in the mInfoArray...} index := 0; while (index < maxFoundMatches) do begin if (mInfoArray[index].fileNum = fileNumber) then break; Inc(index); end; {then use the value of the index variable to tell us where to START putting values into the array. The presence of a 0 in the fileNum field tells us when to STOP...} while (index < maxFoundMatches) do begin if (mInfoArray[index].fileNum = 0) then break; if (not(mInfoArray[index].fileNum = fileNumber)) then break; mInfoArray[index].fileSize := currFileSize; Inc(index); end; end; function TSrchForm.LoadSourceFile(srcFileName: String): String; {read a potentially big source text file in maxBufferSize chunks, and call the Search() routine each time we load up a new buffer-full of data...} var {declare the source file to be an UNTYPED file so that we are then able to use Seek() and BlockRead() -which we could not do if we opened the file as a 'text' file...} srcFile: file; readStatus: String; bytesRead, blocksRead, i: Integer; begin readStatus := ''; AssignFile(srcFile, srcFileName); try {to open source file} reset(srcFile,1); try {putting source file data into memory ie into the buffer} repeat bytesRead := 0; blocksRead := 0; {next line is very important (and so simple in Delphi!) -basically it allows the program to be aware of the user pressing a key to interrupt a search, if they HAVE pressed a key. Basically the ProcessMessages() method interrupts the execution of the program so that Windows can respond to events -the looked-for event in this case being a user keypress... CURRENTLY HOWEVER IF IT'S THE CASE THAT AT LEAST ONE SEARCH HAS ALREADY BEEN DONE THEN THE PROGRAM IS QUICK TO PICK UP ON THE KEYPRESS, AND INTERRUPT THE SEARCH. IF ON THE OTHER HAND IT'S THE FIRST SEARCH, THEN IT'S SLOW TO REACT -this down to the intricacies of the Windows Message Loop -see also SearchButtonKeyDown event handler below...} Application.ProcessMessages; if (stopSearch = False) then begin blockread(srcFile, Buffer^, sizeof(Buffer^), bytesRead); Inc(blocksRead); {note that we don't re-initialise the buffer each time we use it- we don't need to, and doing that would slow things down. However, on the final pass (where we grab a chunk of data that is almost certain to be less than a 'bufferfull') the REST of the buffer then will still have data in from the previous blockread, so we must erase that, but, if we also add just one space character to existing text this stops the EditLine procedure truncating the last line of text if the last search pattern character matches the last file character -anoraksville, man...} if (bytesRead < maxBufferSize) then begin Buffer^[bytesRead + 1] := ' '; for i := (bytesRead + 2) to maxBufferSize do begin {note the syntax below- the Buffer variable is a pointer type, so to reference the character array to which it points, we use the carat after the (pointer) variable name, okey dokey...} Buffer^[i] := #0; end; end; OverwriteDuffBufferChars; if (blocksRead > 1) then begin {if Buffer is full when we come to read data in, (as code here iterates in a repeat loop) then skip backwards pattLen bytes in the source file -this will ensure that we don't miss a searched-for pattern existing on a 'blockread boundary'...} seek(srcFile, filepos(srcFile)-pattLen); end; {insert code here to convert all chars to uppercase if necessary... see BMH algorithm sourcecode DelphiOne/Temp/Nine dir...} if (caseInSensitive = True) then CapitaliseBuffer; {call the appropriate search routine to search the current buffer, which exists as a 'global' variable (so [of course] we don't need to hand the buffer to the search routine as a parameter)...} if (searchType = 'DEFAULT_SEARCH') then DefaultSearch; if (searchType = 'AND_SEARCH') then ANDSearch; if (blocksRead > 1) then begin {each time we read a block, add the last 'bytesRead' value to the currFileSize variable so that we can i) find the true file pos for a match by adding the search routine (current buffer) pointer to currFileSize, and ii) when we finish reading a file the value of currFileSize is then the TOTAL file, so we can put i) and ii) values into the mInfoArray, BUT (are you listening carefully?) BECAUSE we backtrack in the sourcefile pattLen characters on every blockread after the first one, sIdx does not then reflect the 'true' filePosition, which will in fact be pattLen bytes LESS than sIdx plus the sum of prior blockreads, so we must cater for this, below} currFileSize := ((currFileSize + bytesRead) - pattLen); end else begin currFileSize := currFileSize + bytesRead; end; end; until ((bytesRead < maxBufferSize) or (bytesRead = 0) or (numMatches >= maxFoundMatches) or (stopSearch = True)); finally closefile(srcFile) end; {putting source file data into memory} except on E: EInOutError do begin readStatus := GetError(E.ErrorCode); end end; {trying to open source file} {if there hasn't been an error reading the source file then return the empty string in readStatus. If there HAS been an error then return an error message string...} Result := readStatus; end; function TSrchForm.GetError (const ErrorCode: integer): string; {return a string pertaining to the type of error. If IO-checking was off we could check for errors by looking at IOResult, but in this program we use an exception handler (in the file reading routine above) instead. The strings listed below are taken from Borland's 'Object Pascal Language Guide' for Delphi Version 1.0, pages 273-275...} begin case ErrorCode of 2: Result := 'File not found'; 3: Result := 'Path not found'; 4: Result := 'Too many open files'; 5: Result := 'File access denied'; 6: Result := 'Invalid file handle'; 12: Result := 'Invalid file access code'; 15: Result := 'Invalid drive'; 100: Result := 'Disk read error'; 101: Result := 'Disk write error'; 102: Result := 'File not assigned'; 103: Result := 'File not open'; else Result := '' end end; procedure TSrchForm.OverwriteDuffBufferChars; {go right through the current buffer overwriting any characters we are sure to consider undesirable with space characters. Doing this here, before we do anything else, is good, since we are making sure that buffers don't have any weird unprintable stuff in that might screw things up later on. So first populate the allowableChars set variable with ALLOWABLE characters, which may be either from the 'normal' ASCII range of 0 to 127 (from space to '~') and which may also be from the 'extended' ASCII range of 128 to 255 (all the 'visibles' in that range). Note that while a routine such as this will be a good idea in a program such as this, the definition of 'undesirable' is likely to change from program to program depending on specific aims. Note that it MAY be important that we ALLOW CR and LF chars here...} type TallowableChars = set of Char; var allowableChars: TallowableChars; i: LongInt; begin allowableChars := [Chr(10), Chr(13), Chr(32)..Chr(126), Chr(145)..Chr(151), Chr(161)..Chr(174), Chr(176)..Chr(181),Chr(183)..Chr(255)]; for i := 1 to maxBufferSize do begin if not(buffer[i] in allowableChars) then buffer[i] := ' '; end; end; procedure TSrchForm.DefaultSearch; {use a variation on the Boyer-Moore (aka 'mismatched character') algorithm to search the current buffer for strings that match the user-input 'search pattern' string. Where a match is found, extract chunkLen characters from AROUND the 'match' position in the source file, and write them to a memo component...} var sIdx, sIdxTmp, pIdx: Longint; matchFound: Boolean; begin matchFound := True; {sIdx is a 'sourceIndex' variable used to keep track of our position within the source file. Note that initially we set it to pattLen, (remembering that in the buffer the first byte has an index of 1) so that at first we look for a match pattLen characters into the source text, NOT at the beginning as one might at first expect...} sIdx := pattLen; {do main search loop -cleverer than it might appear at first glance..!} repeat matchFound := True; sIdxTmp := sIdx; for pIdx := pattLen downto 1 do begin if (buffer[sIdxTmp] = searchPatt[pIdx]) then begin Dec(sIdxTmp); end else begin matchFound := False; break; end; end; if (matchFound = True) then begin GetChunk(sIdxTmp); mInfoArray[numMatches].fileNum := currFileNum; mInfoArray[numMatches].filePos := currFileSize + sIdxTmp; Inc(numMatches); sIdx := sIdx + pattLen; end else begin sIdx := (sIdx + skip[ord(buffer[sIdx])]); end; until ((sIdx >= maxBufferSize) or (numMatches >= maxFoundMatches)); end; procedure TSrchForm.ANDSearch; {do a search where user input is in the form of TWO 'search pattern' variables. Although it would be possible to adapt the Boyer-Moore algorithm such that an amended BM technique uses the same method to search for both patterns, here (for simplicity's sake) we use the Boyer-Moore method to search for the first pattern, and then if we find a match, we do a 'brute-force' search backwards and forwards from that position to look for an instance of the second pattern (going backwards or forwards only ANDsearchWidth characters) -OK?...} var sIdx, sIdxTmp, pIdx, w, x, y, z, bruteIdx, bruteEndPos: Longint; firstMatchFound, assumedMatch, secondMatchFound, bufEnd: Boolean; tempPattArray: array[0..maxInputLen] of Char; begin firstMatchFound := True; secondMatchFound := False; assumedMatch := True; bufEnd := False; sIdx := pattLen; {firstly copy the SECOND search pattern string into an array...} for x := 1 to (pattLenTwo) do begin tempPattArray[x - 1] := searchPattTwo[x]; end; tempPattArray[x - 1] := #0; repeat FirstMatchFound := True; sIdxTmp := sIdx; for pIdx := pattLen downto 1 do begin if (buffer[sIdxTmp] = searchPatt[pIdx]) then begin Dec(sIdxTmp); end else begin firstMatchFound := False; break; end; end; if (firstMatchFound = True) then begin {at this point we have found a match on the first pattern, with the position of the first source file matched character held in sIdxTmp- so firstly go back ANDsearchWidth places (while avoiding backtracking past the start of the file/buffer)...} if (sIdxTmp <= ANDsearchWidth) then bruteIdx := 1; if (sIdxTmp >= ANDsearchWidth) then bruteIdx :=(sIdxTmp - ANDsearchWidth); {now (assuming we don't go forward past the end of the file/buffer) we simply look for a match against the second pattern by going forward ((2 x ANDsearchWidth) + pattLen) places. (Ignoring the complication arising from going back less than ANDsearchWidth places from match pos if we are near the start of the file, as that doesn't really matter)...} bruteEndPos := (bruteIdx + (ANDsearchWidth * 2) + pattLen); x := bruteIdx; {if you didn't know, the brute-force search algorithm works like this: i) from pos n in source file compare bytes one at a time against bytes in searchPatt ii) if the search fails, then move source file position (n) one place forward and do the same thing again...} while ((x <= bruteEndPos) and (bufEnd = False) and (secondMatchFound = False)) do begin y := x; z := 0; while (assumedMatch = True) do begin if not(tempPattArray[z] = buffer[y]) then assumedMatch := False else begin Inc(z); Inc(y); end; if (z = pattLenTwo) then secondMatchFound := True; if (y >= maxBufferSize) then bufEnd := True; end; Inc(x); assumedMatch := True; end; if (secondMatchFound = True) then begin {at this point we should (WILL!) have found a match on searchPattTwo...} GetChunk(sIdxTmp); mInfoArray[numMatches].fileNum := currFileNum; mInfoArray[numMatches].filePos := currFileSize + sIdxTmp; Inc(numMatches); end; secondMatchFound := False; sIdx := sIdx + pattLen; end else begin sIdx := (sIdx + skip[ord(buffer[sIdx])]); end; until ((sIdx >= maxBufferSize) or (numMatches >= maxFoundMatches)); end; procedure TSrchForm.CapitaliseBuffer; {Perform our own DIY conversion routine. Careful examination of characters in the range 0 to 255 reveals that there are two uninterrupted sequences of lowercase characters that have uppercase equivalents (and vice versa). One sequence is in the lower ('ASCII') range (ie 'a' to 'z') -the other sequence is in the higher range (ie à to ı). Also, for both sequences the uppercase equivalent of a character is 32 characters LESS than the lowercase version. So, therefore, the algorithm will be: IF character in range 97 to 122 inclusive, OR if character in range 224 to 253 inclusive, then character = the character 32 characters 'behind'...} var i: Integer; begin for i := 1 to sizeof(Buffer^) do begin if (((Ord(Buffer^[i]) >= 97) and (Ord(Buffer^[i]) <= 122))) then begin Buffer^[i] := Chr(Ord(Buffer^[i]) - 32); end; if (((Ord(Buffer^[i]) >= 224) and (Ord(Buffer^[i]) <= 253))) then begin Buffer^[i] := Chr(Ord(Buffer^[i]) - 32); end; end; {we might be scanning through NULLs at the end of the last buffer but this doesn't matter...} end; procedure TSrchForm.GetChunk(bufPos: LongInt); {extract a chunk of source file data around the position of the match and edit that line, finally adding it to the the list of found match lines (extent as a Memo component)...} var i: Integer; srcChunk: array[0..chunkLen] of Char; tempString: String; firstSpacePos: LongInt; firstSrcWord: Boolean; begin firstSrcWord := False; {initialise both the 'chunk' array and 'tempString'...} for i := 0 to (chunkLen - 1) do begin srcChunk[i] := #0; end; tempString := ''; {the following works around the fact that the EditLine routine WOULD remove a match word if it was at the very first position in the source file unless we add the space that we do here -note that IF bufPos is SOMEHWERE in the middle of the first word (ie if the value of BufPos is less than the position of the first space) then we DON'T handle that -attempting to do so creates problems of it's own...} if (bufPos = 0) then begin firstSrcWord := True; end; {extract text from the source file, going backwards HALF chunkLen characters before the match, and HALF chunkLen characters after it, placing characters into the srcChunk variable, which is a character array (whose indexing therefore starts at 0)... note also the next two lines to ensure that we don't try and backtrack past the START of the buffer (which would surely cause havoc)...} if (bufPos <= halfaChunk) then bufPos := 1; if (bufPos >= halfaChunk) then bufPos := (bufPos - halfaChunk); if (firstSrcWord = True) then begin srcChunk[0] := ' '; i := 1; end else begin i := 0; end; while (i < chunkLen) do begin {avoid putting unnecc. line feed/return characters into memo lines...} if ((buffer[bufPos] = #10) or (buffer[bufPos] = #13)) then begin {replace line feed/return characters with spaces...} srcChunk[i] := ' '; Inc(i); end else begin srcChunk[i] := buffer[bufPos]; Inc(i); end; {make sure we don't go past the end of srcChunk...} if (i >= (chunkLen - 1)) then break; Inc(bufPos); {make sure we don't go past the end of the buffer...} if (bufPos >= maxBufferSize) then break; end; {while} {terminate the srcChunk array with a NULL character...} srcChunk[i] := #0; tempString := EditLine(srcChunk, i); ResultsForm.LinesMemo.Lines.Add(tempString); end; function TSrchForm.EditLine(tempLine: array of Char; i: Integer): String; {this function takes as a parameter a char array which is a single line (aka 'chunk') of source file text. The 'i' parameter is the length of the line. At this point, however, we may have 'part-words' at the beginning and end of the line, so this routine strips away these unwanted characters to produce lines that start and end with whole words. The resulting 'edited' line is then returned as a string...} var j, n: Integer; tempArray: array[0..chunkLen] of Char; tempString: String; begin {firstly go forward in the sourcefile 'chunk' until you get to a space...} j := i; i := 0; while not(tempLine[i] = ' ') do begin Inc(i); if (i >= j) then break; end; Inc(i); n := 0; {now put the whole of what remains into tempArray...} while (i <= j) do begin tempArray[n] := tempLine[i]; Inc(i); Inc(n); end; {and then backtrack from the END of temparray, until you get to a space, placing a NULL character (#0) in the last 'non-space' position...} while not(tempArray[n] = ' ') do begin Dec(n); if (n <= 0) then break; end; tempArray[n] := #0; tempString := string(tempArray); Result := tempString; end; procedure TSrchForm.SearchButtonKeyDown(Sender: TObject; var Key: Word; Shift: TShiftState); {if the user has initiated a search (by clicking on the Search button with the mouse) then the focus remains on the button as the search is performed- SO, if the user wants to interrupt a search, then all we need to do here is look for the ESC key being pressed: if ESC has been pressed, then we set a global variable which the LoadSourceFile() routine keeps tabs on from time to time...} begin {code here uses the 'virtual key code' for the ESC key...} if (Key = VK_ESCAPE) then stopSearch := True; end; procedure TSrchForm.SDomainButtonClick(Sender: TObject); begin SDForm.SetSearchDomain; {next line is necessary to ensure that we don't continually add to the fileList- it's OK to clear it since when we call GetFilesNames (as below) we can be sure that the fileList will be updated there with a CURRENT list of 'search domain' filenames...} fileList.Clear; SDForm.GetFileNames; {the two (re)-initialisations below preclude the possibility of the program confusing file 1 in a previously-chosen 'page' window with file 1 of a newly-defined search DOMAIN...} ResultsForm.currLoadFile := 0; ResultsForm.prevLoadFile := 0; InitMInfoArray; end; procedure TSrchForm.EndSearch; var i, numMatches: Integer; tempString: String; begin numMatches := ResultsForm.ProcessMInfoMemo; if (numMatches = 0) then begin if (searchType = 'AND_SEARCH') then tempString := 'No matches found for ' + searchPatt + ' and ' + searchPattTwo else tempString := 'No matches found for ' + searchPatt; Application.MessageBox(PChar(tempString), 'Search results', mb_OK); end else begin if (caseInSensitive = True) then begin for i := 0 to ResultsForm.LinesMemo.Lines.Count do begin {having all uppercase text in the memo doesn't look too hot so lowercase everything instead...} ResultsForm.LinesMemo.Lines[i] := AnsiLowerCase(ResultsForm.LinesMemo.Lines[i]); end; end; ResultsForm.CaptionResultsForm; ResultsForm.Show; end; SrchForm.Caption := ' Search Engine SEARCH COMPLETED'; Cursor := crDefault; end; procedure TSrchForm.HelpButtonClick(Sender: TObject); {run the Helpfile for the application -as it stands, the Helpfile must be in the STARTUP directory along with the associated contents file, so both Search.hlp and Search.cnt must be present...} var exeFileName, pathToExe, fileName: String; begin {because changing the directory in the file List box changes the overall 'current directory' such that the program won't find the Help file if we're not careful, we need to keep track of the path to the .exe (and therefore the .hlp) WHEN THE PROGRAM FIRST RUNS...} exeFileName := Application.ExeName; pathToExe := ExtractFilePath(exeFileName); fileName := pathToExe + helpFileName; Application.HelpFile := fileName; Application.HelpCommand(HELP_CONTENTS, 0); end; procedure TSrchForm.ReadINIFile; {read this module's INI file, which will be in the Windows directory...} var thisModuleIni: TIniFile; begin thisModuleIni := TIniFile.Create(iniFileName); with thisModuleIni do begin try SrchForm.Left := ReadInteger('SrchFormPos', 'Left', 0); SrchForm.Top := ReadInteger('SrchFormPos', 'Top', 0); {SrchForm.Width := ReadInteger('SrchFormPos', 'Width', 0); SrchForm.Height := ReadInteger('SrchFormPos', 'Height', 0);} finally {free up memory used in 'creating' the INI file...} Free; end; end; end; procedure TSrchForm.WriteINIFile; var thisModuleIni: TIniFile; begin thisModuleIni := TIniFile.Create(iniFileName); with thisModuleIni do begin try WriteInteger('SrchFormPos', 'Left', SrchForm.Left); WriteInteger('SrchFormPos', 'Top', SrchForm.Top); {WriteInteger('SrchFormPos', 'Width', SrchForm.Width); WriteInteger('SrchFormPos', 'Height', SrchForm.Height);} finally {free up memory used in creating the INI file (again)...} Free; end; end; end; end.