import os
import os.path
from whoosh import index
from whoosh import scoring
from whoosh import highlight
from whoosh.fields import ID, TEXT,KEYWORD, Schema
from whoosh.reading import TermNotFound
from whoosh.qparser import QueryParser
def createSearchableData(indexdir, root):
    """Build a Whoosh index in ``indexdir`` from the files found in ``root``.

    Schema: ``title`` (file name, stored), ``path`` (stored ID), ``content``
    (indexed but not stored) and ``textdata`` (stored KEYWORD field holding
    the full text of the file).

    Args:
        indexdir: Directory that receives the index. It is created, together
            with any missing parent directories, when it does not exist.
        root: Directory whose immediate regular files are read as UTF-8 and
            indexed. Subdirectories and other non-file entries are skipped.

    Returns:
        The index object returned by ``index.create_in``.

    Raises:
        OSError: If ``root`` cannot be listed or a file cannot be opened.
        UnicodeDecodeError: If a file is not valid UTF-8.
    """
    schema = Schema(
        title=TEXT(stored=True),
        path=ID(stored=True),
        content=TEXT,
        textdata=KEYWORD(stored=True),
    )

    # makedirs (unlike mkdir) also creates missing parents, and exist_ok
    # avoids the check-then-create race of the exists()/mkdir() pair.
    os.makedirs(indexdir, exist_ok=True)

    # Create the index and a writer to add one document per file.
    ix = index.create_in(indexdir, schema)

    # Used as a context manager, the writer commits on normal exit and is
    # cancelled if an exception escapes, so a failure part-way through does
    # not leave the index write-locked.
    with ix.writer() as writer:
        for entry in os.listdir(root):
            path = os.path.join(root, entry)
            # open() on a directory raises; only regular files are indexed.
            if not os.path.isfile(path):
                continue
            # The with-block closes the file even if read() fails.
            with open(path, 'r', encoding='utf-8') as fp:
                print(path)
                text = fp.read()
            writer.add_document(
                title=os.path.basename(path),
                path=path,
                content=text,
                textdata=text,
            )
    return ix
if __name__ == '__main__':
    # Demo: index every file in search_dir, then list the titles of the
    # documents whose 'textdata' field matches the query.

    # Initialisation: build the index.
    my_index_path = "d:/temp/whoosh_dir/"
    search_dir = "d:/temp/txt_files/"
    iy = createSearchableData(my_index_path, search_dir)

    # Search
    query_str = "C++"
    with iy.searcher(weighting=scoring.Frequency) as searcher:
        # Parse the query against the stored 'textdata' field.
        qp = QueryParser('textdata', schema=iy.schema)
        query = qp.parse(query_str)
        print(query)
        # limit=None asks for all matching documents instead of a top-N page.
        results = searcher.search(query, limit=None)
        print(results)
        for hit in results:
            print(" * {}".format(hit['title']))
Partager