Delphi IFilter Implementation

I need to implement IFilter in Delphi 2010, which can search for Office 2007 docx files and return the text found in the document.

The IFilter must also support the IPersistStream interface.

thanks

+4
source share
2 answers

You do not want to implement an IFilter yourself to parse Office 2007 docx files. You want to use the IFilter objects that Microsoft has already written, so that you can read the contents of a docx file.

Then you use the standard IFilter mechanisms to analyze the contents of the file:

 { Logs the text content of a document by driving the IFilter returned by
   LoadIFilter for that file.

   filename: path of the document to process.

   Steps: obtain the filter, call Init with IFILTER_INIT_INDEXING_ONLY, then
   call GetChunk repeatedly until it returns a failure HRESULT. For each chunk
   flagged CHUNK_TEXT the text is pulled with GetText in blocks of
   TEXT_BUFFER_CHARS characters and written to the log. CHUNK_VALUE chunks are
   only reported, not read.

   Raises whatever OleCheck raises when Init returns a failure HRESULT. }
 procedure TForm1.ProcessFile(filename: string);
 const
   TEXT_BUFFER_CHARS = 2048; // characters requested per GetText call
 var
   filter: IFilter;
   hr: HRESULT;
   chunk: STAT_CHUNK; // local record; GetChunk fills it through a pointer
   flags: ULONG;
   c: Cardinal;
   buffer: WideString;
 begin
   Log('Processing "'+filename+'"');

   Log('Calling LoadIFilter');
   filter := LoadIFilter(filename);
   if filter = nil then
   begin
     Log('filter is null; leaving');
     Exit;
   end;

   try
     Log('Calling filter.Init(IFILTER_INIT_INDEXING_ONLY)');
     hr := filter.Init(IFILTER_INIT_INDEXING_ONLY, 0, nil, flags);
     OleCheck(hr);

     Log('Init returned sucessfully, looking for chunks...');
     while True do
     begin
       hr := filter.GetChunk(@chunk);
       if Failed(hr) then
       begin
         // Any failure HRESULT from GetChunk ends the loop.
         Log('No more chunks: '+IntToHex(hr, 8)+' ('+GetChunkHresultToStr(hr)+')');
         Break;
       end;

       Log('== Got chunk. ChunkType='+IntToStr(chunk.flags)+' (1=text, 2=value) ==');

       if (chunk.flags and CHUNK_TEXT) = CHUNK_TEXT then
       begin
         // c is in/out: buffer capacity in characters going in, characters
         // actually written coming out.
         c := TEXT_BUFFER_CHARS;
         SetLength(buffer, c);
         hr := filter.GetText(c, PWideChar(buffer));
         if Succeeded(hr) then
         begin
           Log('=== Got text ===');
           SetLength(buffer, c); // trim to the number of characters returned
           Log(buffer);

           // Keep reading until GetText reports a failure HRESULT.
           while Succeeded(hr) do
           begin
             c := TEXT_BUFFER_CHARS;
             SetLength(buffer, c);
             hr := filter.GetText(c, PWideChar(buffer));
             if Succeeded(hr) then
             begin
               SetLength(buffer, c);
               Log('==== Really long chunk, here''s the next 2048 characters ====');
               Log(buffer);
             end;
           end;
         end
         else
         begin
           Log('Could not get text from chunk: '+IntToHex(hr, 8)+' ('+GetChunkHResultToStr(hr)+')');
           Log(' It might be a "Value" chunk, meaning i should call filter.GetValue rather than filter.GetText. But i''m too lazy');
         end;
       end
       else if (chunk.flags and CHUNK_VALUE) = CHUNK_VALUE then
       begin
         Log('This is a "VALUE" chunk. i''m not going to read anything out of it cause it''s too hard :(');
       end
       else
         Log('Unknown chunk type');
     end; // while True: getting chunks
   finally
     filter := nil; // release the interface reference
   end;
 end;

Windows already provides a function that loads the IFilter for a specified file name:

 { Returns the IFilter for the given file name.

   Passes the file name to ntQuery.LoadIFilter, hands the returned HRESULT to
   OleCheck, and casts the resulting IUnknown to IFilter with "as". }
 function TForm1.LoadIFilter(const filename: WideString): IFilter;
 var
   rawFilter: IUnknown;
 begin
   OleCheck(ntQuery.LoadIFilter(PWideChar(filename), nil, rawFilter));
   Result := rawFilter as IFilter;
 end;

The IFilter declaration unit:

 unit Filter;

 { Delphi declarations for the Windows IFilter interface: the init flags,
   chunk types, record layouts, HRESULT codes and the interface itself. }

 interface

 uses
   Windows, SysUtils, Classes, ActiveX;

 type
   IFILTER_INIT = TOleEnum;
 const
   // Flags for the grfFlags parameter of IFilter.Init; may be combined.
   IFILTER_INIT_CANON_PARAGRAPHS = 1;
   IFILTER_INIT_HARD_LINE_BREAKS = 2;
   IFILTER_INIT_CANON_HYPHENS = 4;
   IFILTER_INIT_CANON_SPACES = 8;
   IFILTER_INIT_APPLY_INDEX_ATTRIBUTES = 16;
   IFILTER_INIT_APPLY_OTHER_ATTRIBUTES = 32;
   IFILTER_INIT_INDEXING_ONLY = 64;
   IFILTER_INIT_SEARCH_LINKS = 128;

 type
   IFILTER_FLAGS = TOleEnum;
 const
   IFILTER_FLAGS_OLE_PROPERTIES = 1;

 type
   CHUNKSTATE = TOleEnum;
 const
   // Bit values found in STAT_CHUNK.flags.
   CHUNK_TEXT = $01;
   CHUNK_VALUE = $02;

 type
   CHUNK_BREAKTYPE = TOleEnum;
 const
   CHUNK_NO_BREAK = 0;
   CHUNK_EOW = 1;
   CHUNK_EOS = 2;
   CHUNK_EOP = 3;
   CHUNK_EOC = 4;

 type
   FILTERREGION = packed record
     idChunk: ULONG;
     cwcStart: ULONG;
     cwcExtent: ULONG;
   end;
   tagFILTERREGION = FILTERREGION;

 const
   // Values for PROPSPEC.ulKind.
   PRSPEC_LPWSTR = 0;
   PRSPEC_PROPID = 1;

 type
   PROPID = ULONG;

 type
   // Not declared "packed": the record holds a pointer, and natural alignment
   // is required for the layout to match the C struct on 64-bit targets. On
   // 32-bit targets every field is 4 bytes wide, so the layout is unchanged.
   PROPSPEC = record
     ulKind: ULONG;
     case integer of
       0: (prid: PROPID);
       1: (lpws: PWideChar);
   end;
   tagPROPSPEC = PROPSPEC;

 type
   FULLPROPSPEC = record
     guidPropSet: TGUID;
     psProperty: PROPSPEC;
   end;
   tagFULLPROPSPEC = FULLPROPSPEC;
   PFULLPROPSPEC = ^FULLPROPSPEC;

 type
   STAT_CHUNK = record
     idChunk: ULONG;
     breakType: CHUNK_BREAKTYPE;
     flags: CHUNKSTATE;
     locale: LCID;
     attribute: FULLPROPSPEC;
     idChunkSource: ULONG;
     cwcStartSource: ULONG;
     cwcLenSource: ULONG;
   end;
   tagSTAT_CHUNK = STAT_CHUNK;
   PSTAT_CHUNK = ^STAT_CHUNK;

 // From filterr.h
 const
   // No more chunks of text available in object.
   FILTER_E_END_OF_CHUNKS = HRESULT($80041700);

   // No more text available in chunk.
   FILTER_E_NO_MORE_TEXT = HRESULT($80041701);

   // No more property values available in chunk.
   FILTER_E_NO_MORE_VALUES = HRESULT($80041702);

   // Unable to access object.
   FILTER_E_ACCESS = HRESULT($80041703);

   // Moniker doesn't cover entire region.
   // This is a warning with success severity, so the high bit must be clear.
   FILTER_W_MONIKER_CLIPPED = HRESULT($00041704);

   // No text in current chunk.
   FILTER_E_NO_TEXT = HRESULT($80041705);

   // No values in current chunk.
   FILTER_E_NO_VALUES = HRESULT($80041706);

   // Unable to bind IFilter for embedded object.
   FILTER_E_EMBEDDING_UNAVAILABLE = HRESULT($80041707);

   // Unable to bind IFilter for linked object.
   FILTER_E_LINK_UNAVAILABLE = HRESULT($80041708);

   // This is the last text in the current chunk.
   FILTER_S_LAST_TEXT = HRESULT($00041709);

   // This is the last value in the current chunk.
   FILTER_S_LAST_VALUES = HRESULT($0004170A);

   // File was not filtered due to password protection.
   FILTER_E_PASSWORD = HRESULT($8004170B);

   // The document format is not recognized by the filter.
   FILTER_E_UNKNOWNFORMAT = HRESULT($8004170C);

 const
   IID_IFilter: TGUID = '{89BCB740-6119-101A-BCB7-00DD010655AF}';

 type
   IFilter = interface(IUnknown)
     ['{89BCB740-6119-101A-BCB7-00DD010655AF}']
     // Initializes a filtering session. grfFlags is a combination of the
     // IFILTER_INIT_* values; pFlags receives IFILTER_FLAGS_* values.
     function Init(grfFlags: ULONG; cAttributes: ULONG; aAttributes: PFULLPROPSPEC; out pFlags: ULONG): HResult; stdcall;
     // Positions the filter on the next chunk and fills pStat^ with its description.
     function GetChunk(pStat: PSTAT_CHUNK): HResult; stdcall;
     // pcwcBuffer: on entry the capacity of awcBuffer in characters, on exit
     // the number of characters written.
     function GetText(var pcwcBuffer: ULONG; awcBuffer: PWideChar): HResult; stdcall;
     // The COM signature is PROPVARIANT**: the filter returns a pointer to a
     // PROPVARIANT it allocated, so the parameter is "out PPropVariant".
     // NOTE(review): per the IFilter::GetValue documentation the caller frees
     // the result with PropVariantClear and CoTaskMemFree - confirm at call sites.
     function GetValue(out ppPropValue: PPropVariant): HResult; stdcall;
     // riid is REFIID (a pointer to a GUID) in the COM signature, so it is
     // declared "const" here to be passed by reference, not copied onto the stack.
     function BindRegion(origPos: FILTERREGION; const riid: TGUID; out ppUnk): HResult; stdcall;
   end;

 implementation

 end.
+5
source

If you search the old Borland / CodeGear newsgroups, you can find links to an IFilter implementation at "Soluciones Vulcano", which links to develop.shorterpath.com, which still exists. Apart from that, I have never seen any other component implementation, and I have not yet been able to look at it myself.

0
source

Source: https://habr.com/ru/post/1315741/


All Articles