Menu

[8f66b6]: / fulltextindex / FullTextIndex.py  Maximize  Restore  History

Download this file

244 lines (211 with data), 11.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
# -*- coding: utf-8 -*-
"""
Copyright (C) 2011 Oliver Tengler
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Lesser General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU Lesser General Public License for more details.
You should have received a copy of the GNU Lesser General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import sqlite3
import threading
from typing import List, Tuple, Iterable, Any, Dict, cast, Callable
from tools.FileTools import fopen
from .IndexDatabase import IndexDatabase
from .FileSearch import searchFile
from .Query import ContentQuery, FileQuery, PerformanceReport, ReportAction, safeLen, SearchResult
def intersectSortedLists(l1: List[str], l2: List[str]) -> List[str]:
    """
    Returns the elements which are contained in both l1 and l2.

    Both inputs must be sorted ascending. The lists are walked in parallel with one cursor
    each; the cursor pointing to the smaller element is advanced, equal elements are
    collected and both cursors move on. An element which occurs several times in both
    lists is reported once per matching pair.
    """
    common = []
    posLeft = 0
    posRight = 0
    endLeft = len(l1)
    endRight = len(l2)
    # As soon as one list is exhausted there cannot be any further common element
    while posLeft < endLeft and posRight < endRight:
        left = l1[posLeft]
        right = l2[posRight]
        if left < right:
            posLeft += 1
        elif left > right:
            posRight += 1
        else:
            common.append(left)
            posLeft += 1
            posRight += 1
    return common
class Keyword:
    """A keyword of the index, described by its numeric identifier and its name."""

    def __init__(self, identifier: int, name: str) -> None:
        self.name = name
        self.id = identifier

    def __repr__(self) -> str:
        # Rendered as "name (id)"
        parts = (self.name, self.id)
        return "%s (%u)" % parts

    def __eq__(self, other: object) -> bool:
        # Only exact Keyword instances can be equal, subclasses and foreign types never are
        if type(other) is Keyword:
            candidate = cast(Keyword, other)
            return self.id == candidate.id and self.name == candidate.name
        return False
# One entry per search term: the list of Keyword objects from the index which match that term.
KeywordList = List[List[Keyword]]
# Maps a common keyword to a number. A lower number means a more common, i.e. worse, keyword.
CommonKeywordMap = Dict[str,int]
def buildMapFromCommonKeywordFile(name:str) -> CommonKeywordMap:
    """
    Reads a file with one keyword per line and maps each keyword to its zero based line number.

    Keywords are stripped and converted to lower case, blank lines are skipped but still
    counted. If a keyword occurs more than once the number of its last occurrence is kept.
    An empty name yields an empty map without touching the file system.
    """
    keywordToLine = {}
    if not name:
        return keywordToLine

    with fopen(name, "r") as inputFile:
        for lineNumber, line in enumerate(inputFile):
            normalized = line.strip().lower()
            if normalized:
                keywordToLine[normalized] = lineNumber
    return keywordToLine
def cancelableSearch(func: Callable[..., SearchResult], *args: Any) -> SearchResult:
    """
    Calls func(*args) and returns its result.

    An sqlite3.OperationalError whose message is exactly "interrupted" is translated into
    an empty result. Every other OperationalError, as well as any other exception, is
    passed on to the caller.
    """
    try:
        result = func(*args)
    except sqlite3.OperationalError as error:
        if str(error) != "interrupted":
            raise
        return []
    return result
class FullTextIndex (IndexDatabase):
    """
    Search operations on top of IndexDatabase.

    A content search resolves the indexed parts of the query to keyword ids, looks up the
    documents which contain all of them and, if the query requires it, verifies the
    candidates by running a regular expression over the file contents.

    All queries use a cursor obtained from self.conn, which is presumably provided by
    IndexDatabase (not visible in this file).
    """

    def searchFile(self, query: FileQuery, perfReport: PerformanceReport=None, cancelEvent:threading.Event=None) -> SearchResult:
        """
        Runs a file query. Returns an empty result if sqlite reports the statement as interrupted.

        NOTE(review): cancelEvent is accepted but currently not evaluated by this method.
        """
        return cancelableSearch(self.__searchFile, query, perfReport)

    def __searchFile(self, query: FileQuery, perfReport: PerformanceReport=None) -> SearchResult:
        """Delegates to the module level searchFile function using a fresh cursor."""
        q = self.conn.cursor()
        # Inside a method the bare name resolves to the imported module level function, not to this class' method
        return searchFile(q, query, perfReport)

    def searchContent(self, query: ContentQuery, perfReport: PerformanceReport=None, commonKeywordMap: CommonKeywordMap=None,
                      manualIntersect: bool=True, cancelEvent:threading.Event=None) -> SearchResult:
        """
        Runs a content query. Returns an empty result if sqlite reports the statement as interrupted.

        commonKeywordMap maps keywords to numbers. A lower number means a worse keyword.
        Bad keywords are very common like "h" in cpp files.
        manualIntersect selects whether the per keyword document lists are intersected in Python
        (True) or by a single SQL INTERSECT statement (False).
        cancelEvent is checked while matching files are filtered by regular expression; if it is
        set the search returns an empty result.
        """
        return cancelableSearch(self.__searchContent, query, perfReport, commonKeywordMap, manualIntersect, cancelEvent)

    def __searchContent(self, query: ContentQuery, perfReport: PerformanceReport=None, commonKeywordMap:CommonKeywordMap=None, manualIntersect:bool=True, cancelEvent:threading.Event=None) -> SearchResult:
        """
        Implementation of searchContent, see there for the parameters.

        Raises RuntimeError if query is not a ContentQuery.
        """
        if not isinstance(query, ContentQuery):
            raise RuntimeError("query must be a ContentQuery derived object")

        perfReport = perfReport or PerformanceReport()
        commonKeywordMap = commonKeywordMap or {}
        q = self.conn.cursor()

        with perfReport.newAction("Finding keywords") as action:
            # The result is a list of lists of Keyword objects, empty if one search term is unknown
            kwList = self.__getKeywords(q, query.indexedPartsLower(), reportAction=action)
            if not kwList:
                return []
            goodKeywords, badKeywords = self.__qualifyKeywords(kwList, commonKeywordMap)

        with perfReport.newAction("Finding documents") as action:
            if manualIntersect:
                result = self.__findDocsByKeywordsManualIntersect(q, goodKeywords, badKeywords, action)
            else:
                result = self.__findDocsByKeywords(q, goodKeywords, badKeywords)
            action.addData("%u matches", safeLen(result))
            if not result:
                return []

        # Each result row holds the full path as its first column
        if query.requiresRegex():
            with perfReport.newAction("Filtering results") as action:
                return self.__filterDocsBySearchPhrase(action, (r[0] for r in result), query, cancelEvent)

        with perfReport.newAction("Returning results"):
            if not query.folderFilter and not query.extensionFilter:
                return [r[0] for r in result]
            return [r[0] for r in result if query.matchFolderAndExtensionFilter(r[0])]

    def __findDocsByKeywords(self, q: sqlite3.Cursor, goodKeywords: KeywordList, badKeywords: KeywordList) -> SearchResult:
        """
        Finds all documents which contain at least one keyword of every keyword group.

        One SELECT per group is combined with INTERSECT so that the database computes the
        intersection. Returns the fetched rows ordered by fullpath. The ids inserted into
        the statement are taken from the Keyword objects.
        """
        selects = []
        for keywords in goodKeywords + badKeywords:
            inString = ",".join(str(keyword.id) for keyword in keywords)
            selects.append("SELECT DISTINCT fullpath FROM kw2doc,documents WHERE docID=id AND kwID IN (%s)" % (inString, ))
        q.execute(" INTERSECT ".join(selects) + " ORDER BY fullpath")
        return q.fetchall()

    def __findDocsByKeywordsManualIntersect(self, q: sqlite3.Cursor, goodKeywords: KeywordList, badKeywords: KeywordList, reportAction: ReportAction) -> SearchResult:
        """
        Finds all documents which contain at least one keyword of every keyword group by
        querying each group separately and intersecting the sorted row lists in Python.

        Good keyword groups are always used. Bad (common) keyword groups are only used as
        long as the result is still empty or contains 100 or more rows; decisions about
        common keywords are written to reportAction.
        """
        result: SearchResult = []
        allKeywords = [(True, keywords) for keywords in goodKeywords] + [(False, keywords) for keywords in badKeywords]
        for isGood, keywords in allKeywords:
            # Stop if all good keywords have been used and the result is stripped down to less than 100 files
            if not isGood:
                kwNames = ",".join(keyword.name for keyword in keywords)
                if result and len(result) < 100:
                    reportAction.addData("Search stopped with common keyword '%s'", kwNames)
                    break
                if result:
                    reportAction.addData("Common keyword '%s' used because %u matches are too much", kwNames, len(result))
                else:
                    reportAction.addData("Common keyword '%s' used as first keyword", kwNames)

            inString = ",".join(str(keyword.id) for keyword in keywords)
            # ORDER BY is required because intersectSortedLists relies on sorted input
            stmt = "SELECT DISTINCT fullpath FROM kw2doc,documents WHERE docID=id AND kwID IN (%s) ORDER BY fullpath" % (inString, )
            q.execute(stmt)
            kwMatches = q.fetchall()
            # An empty result at this point can only mean that this is the first group
            if not result:
                result = kwMatches
            else:
                result = intersectSortedLists(result, kwMatches)
            # No further group can add documents again
            if not result:
                return []
        return result

    def __filterDocsBySearchPhrase(self, action: ReportAction, results: Iterable[str], query: ContentQuery, cancelEvent: threading.Event=None) -> SearchResult:
        """
        Keeps those paths of results whose file content matches the regular expression of the query.

        Paths rejected by the folder and extension filter of the query are not opened.
        Files which cannot be read are skipped. If cancelEvent is set after a candidate
        has been handled the search is aborted and an empty list is returned.
        """
        finalResults = []
        reExpr = query.regExForMatches()
        action.addData("RegEx: %s", reExpr.pattern)
        bHasFilters = query.folderFilter or query.extensionFilter
        for fullpath in results:
            if not bHasFilters or query.matchFolderAndExtensionFilter(fullpath):
                try:
                    with fopen(fullpath) as inputFile:
                        if reExpr.search(inputFile.read()):
                            finalResults.append(fullpath)
                except Exception:
                    # Best effort: a file which vanished or cannot be read or decoded is no match.
                    # KeyboardInterrupt and SystemExit are deliberately not swallowed.
                    pass
            # Checked for every candidate, including filtered ones, to react promptly
            if cancelEvent and cancelEvent.is_set():
                return []
        return finalResults

    def __qualifyKeywords(self, kwList: KeywordList, commonKeywordMap: CommonKeywordMap) -> Tuple[KeywordList,KeywordList]:
        """
        Receives a list of lists of Keywords and returns two lists.

        The first list contains the good keyword groups. These are groups of which no keyword
        is found in commonKeywordMap. They are sorted by the number of Keyword objects per
        group, descending.
        The second list contains the bad keyword groups, i.e. those with at least one keyword in
        commonKeywordMap, ordered from the least bad to the worst. The quality of a group is the
        lowest number any of its keywords has in commonKeywordMap.
        """
        goodKeywords = []
        badKeywordsTemp = []  # contains tuples (quality, keywords) in order to sort by quality
        for keywords in kwList:
            ranks = [commonKeywordMap[keyword.name] for keyword in keywords if keyword.name in commonKeywordMap]
            if ranks:
                # In the common keyword map. min() also handles rank 0 correctly, which is the worst rank.
                badKeywordsTemp.append((min(ranks), keywords))
            else:
                # Not in the common keyword map
                goodKeywords.append(keywords)

        # Sort by quality only: comparing the tuples would fall back to comparing Keyword
        # objects on equal quality and Keyword does not define an ordering.
        badKeywordsTemp.sort(key=lambda item: item[0], reverse=True)
        badKeywords = [keywords for unusedQuality, keywords in badKeywordsTemp]

        # NOTE(review): the original comment hoped that "longer keywords are more unique", but
        # key=len measures the number of Keyword objects in a group, not the keyword text length.
        # Behavior is kept as is - confirm the intent.
        return sorted(goodKeywords, reverse=True, key=len), badKeywords

    def __getKeywords(self, q: sqlite3.Cursor, keywordList: Iterable[str], reportAction: ReportAction=None) -> KeywordList:
        """
        Receives a list of keywords which might contain the wildcard "*". For every passed
        keyword a list of the matching Keyword objects is returned.

        If any keyword has no match at all an empty list is returned for the whole call.
        The number of matches per keyword is written to reportAction if given.
        """
        keys = []
        for kw in keywordList:
            if "*" in kw:
                query = "SELECT id,keyword FROM keywords WHERE keyword LIKE ? ESCAPE '!'"
                # Escape the escape character first, then the LIKE wildcards, so that they match literally
                for special in ("!", "%", "_"):
                    kw = kw.replace(special, "!" + special)
                kw = kw.replace("*", "%")
            else:
                query = "SELECT id,keyword FROM keywords WHERE keyword=?"
            q.execute(query, (kw, ))
            result = q.fetchall()
            if not result:
                if reportAction:
                    reportAction.addData("String '%s' was not found", kw)
                return []
            if reportAction:
                reportAction.addData("String '%s' results in %u keyword matches", kw, len(result))
            keys.append([Keyword(r[0], r[1]) for r in result])
        return keys
Want the latest updates on software, tech news, and AI?
Get latest updates about software, tech news, and AI from SourceForge directly in your inbox once a month.