Overview
I spent some spare time over the weekend writing a small Python program that indexes documents and runs phrase searches, following the idea behind Lucene's inverted index. The goal is to make the basic write and query paths of an inverted index easy to understand. The full program is shown below.
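Before reading the full program, the core idea of an inverted index fits in a few lines: map every term to the sorted list of docIds that contain it, and answer a query by merging those posting lists. Here is a minimal dict-based sketch of that idea (separate from the trie-based implementation below, and using the same three sample documents that appear in the code comments):

from collections import defaultdict

docs = ["hello world", "hello city", "welcome to china"]

# Write path: split each document on whitespace and record which docIds contain each term
inverted_index = defaultdict(list)
for doc_id, doc in enumerate(docs):
    for term in doc.split():
        inverted_index[term].append(doc_id)

# Query path: union the posting lists of the query terms
query = "hello world"
hits = sorted(set().union(*(inverted_index[t] for t in query.split())))
print(hits)  # [0, 1], i.e. docs[0] and docs[1] match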
# -*- coding: utf-8 -*-
"""
File Name: indexingAndSearch.py
Author: Donny.fang
Date: 2021/1/16 18:12
Desc: Python3.8
"""
import logging
from collections import deque
LOG_FORMAT = "[%(asctime)s][%(levelname)s ] %(message)s"
DATE_FORMAT = "%Y-%m-%d %H:%M:%S"
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT, datefmt=DATE_FORMAT)
class TermTrie(object):
    """
    Nodes are stored as nested dicts. After inserting "app" (doc 0),
    "apple" (docs 0 and 1) and "applee" (doc 1), the trie looks like:
    {
        "a": {
            "p": {
                "p": {
                    "#": "#",
                    "app": [0],
                    "l": {
                        "e": {
                            "#": "#",
                            "apple": [0, 1],
                            "e": {
                                "#": "#",
                                "applee": [1]
                            }
                        }
                    }
                }
            }
        }
    }
    """

    def __init__(self):
        self.root = {}
        self.end_of_word = '#'

    def insert(self, term, docId):
        """
        Inserts a term into the trie and records the docId on its posting list.
        :type term: str
        :type docId: int
        :rtype: None
        """
        node = self.root
        for c in term:
            node = node.setdefault(c, {})
        # The final node keeps the full term as a key pointing to its docId list
        node.setdefault(term, []).append(docId)
        node[self.end_of_word] = self.end_of_word

    def getDocIdsByTerm(self, term):
        """
        Returns the docIds recorded for a term, or an empty list if the term
        was never indexed.
        :type term: str
        :rtype: list
        """
        node = self.root
        for c in term:
            if c not in node:
                # The term is not in the trie at all
                return []
            node = node[c]
        if self.end_of_word in node:
            return node.get(term, [])
        return []
class IndexingSearch(object):
    """
    How it works:
    1. Read the file line by line; split each line on whitespace to get its terms.
    2. Insert every term into the trie, recording the docId of the line it came from.
    3. For a phrase search, split the phrase on whitespace into terms as well.
    4. Look up the docIds for each term, merge them and load the matching documents.
    """

    def __init__(self):
        self.docs, self.trie = [], TermTrie()

    @staticmethod
    def generateTerms(document):
        """
        Splits a document on whitespace, e.g. "hello world" ---> "hello", "world"
        :param document: a line of text
        :return: list type, like ["hello", "world"]
        """
        assert isinstance(document, str)
        if len(document.strip()) < 1:
            logging.error("document is empty")
        return document.strip().split()

    def indexingDocument(self, fileName):
        """
        Indexes a file. For a document such as "hello world" (docId 0), splitting on
        whitespace yields the terms "hello" and "world"; each term is inserted into
        the trie together with its docId, e.g. hello ---> [0], world ---> [0].
        :param fileName: file to index
        :return: dict holding the root of the trie
        """
        docId = 0
        with open(fileName, "r") as fr:
            while 1:
                doc = fr.readline().strip()
                if not doc:  # assuming no blank lines between documents, this means end of file
                    break
                terms = IndexingSearch.generateTerms(doc)
                for term in terms:
                    self.trie.insert(term, docId)
                docId += 1
                # Keep every document read from the file in an in-memory list,
                # e.g. docs = ["hello world", "hello city", "welcome to china"]
                self.docs.append(doc)
        return {
            "trie": self.trie.root
        }

    def phraseSearch(self, phrase):
        """
        Runs a phrase query; the phrase is split on whitespace,
        e.g. "hello world" ---> "hello", "world"
        :param phrase: query phrase
        :return: None; the matching documents are printed
        """
        assert isinstance(phrase, str)
        if len(phrase.strip()) <= 0:
            logging.error("query phrase cannot be empty")
            return
        arrs = deque()
        for term in phrase.split():
            arrs.append(self.trie.getDocIdsByTerm(term))
        docsIdToLoad = IndexingSearch.multipleArraysUnion(arrs)
        print("\n".join(["{}: {}".format(index, self.docs[index]) for index in docsIdToLoad]))

    @staticmethod
    def multipleArraysUnion(arrays):
        """
        Union of multiple arrays, e.g. [1,2,3] and [2,3,5,7] ---> [1,2,3,5,7]
        :param arrays: multiple arrays in a deque
        :return: list type, sorted array
        """
        assert isinstance(arrays, deque)
        if len(arrays) < 1:
            logging.error("The number of ordered arrays is 0")
            return []
        rawArray = arrays.popleft()
        while len(arrays) > 0:
            arr = arrays.popleft()
            rawArray = list(set(rawArray).union(set(arr)))
        return sorted(rawArray)
if __name__ == '__main__':
    indexingSearch = IndexingSearch()
    # Index the documents, building the simplified inverted index in memory
    indexingSearch.indexingDocument("/root/hello/myfile")
    # Run a phrase query
    indexingSearch.phraseSearch("hello world")
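As a quick sanity check: assuming /root/hello/myfile contains the three sample lines used in the comments above, i.e.

hello world
hello city
welcome to china

then the query "hello world" fetches the posting lists hello ---> [0, 1] and world ---> [0], takes their union [0, 1], and prints:

0: hello world
1: hello city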
Summary
Why use a trie to store the terms in this hand-written, Lucene-style inverted index? Mainly to save space. Stored in a hash table, "app" and "apple" are two independent strings, so their characters are kept twice; stored in a trie they share the five nodes "a, p, p, l, e", and the common prefix is kept only once. With a large number of terms, this prefix sharing can save a significant amount of memory. The trade-off is lookup speed: finding a term in the trie walks one node per character, i.e. O(len(term)), which is usually slower than looking the term up in a hash table.
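To see the prefix sharing concretely, here is a small sketch that reuses the TermTrie class above, inserts the three terms from its docstring example, and queries them back:

trie = TermTrie()
trie.insert("app", 0)
trie.insert("apple", 0)
trie.insert("apple", 1)
trie.insert("applee", 1)

# All three terms share the single path a -> p -> p inside the trie
print(trie.getDocIdsByTerm("app"))     # [0]
print(trie.getDocIdsByTerm("apple"))   # [0, 1]
print(trie.getDocIdsByTerm("applee"))  # [1]
print(trie.getDocIdsByTerm("appl"))    # [], "appl" was never indexed as a term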