import os
from typing import List
from ..base import BaseDocIndexer as BDI
from ...proto import gnes_pb2
class DirectoryIndexer(BDI):
    """Store and retrieve documents as per-doc-id directories on disk.

    Layout: ``<data_path>/<doc_id>/0.<suffix>, 1.<suffix>, ...`` with the
    document's ``meta_info`` kept in a hidden ``.meta`` file alongside the
    chunk files.
    """

    def __init__(self, data_path: str,
                 keep_na_doc: bool = True,
                 file_suffix: str = 'gif',
                 *args, **kwargs):
        """
        :param data_path: root directory; one sub-folder per doc id is created under it
        :param keep_na_doc: when True, missing docs appear as ``None`` placeholders
            in ``query`` results so the output stays aligned with the input keys
        :param file_suffix: file extension used for each stored chunk
        """
        super().__init__(*args, **kwargs)
        self.data_path = data_path
        self.file_suffix = file_suffix
        self.keep_na_doc = keep_na_doc
        self._NOT_FOUND = None  # placeholder returned for missing docs when keep_na_doc

    @BDI.update_counter
    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """Write the chunks of each document to disk.

        Folder structure: ``<data_path>/<doc_id>/0.<suffix>, 1.<suffix>, ...``

        :param keys: list of doc ids
        :param docs: list of docs, aligned with ``keys``
        """
        for key, doc in zip(keys, docs):
            doc_dir = os.path.join(self.data_path, str(key))
            # exist_ok avoids the exists()/makedirs() race of the naive check
            os.makedirs(doc_dir, exist_ok=True)
            # keep doc meta in a hidden .meta file next to the chunk files
            with open(os.path.join(doc_dir, '.meta'), 'wb') as f:
                f.write(doc.meta_info or b'')
            for i, chunk in enumerate(doc.chunks):
                with open(os.path.join(doc_dir, '%d.%s' % (i, self.file_suffix)), 'wb') as f:
                    f.write(chunk.raw)

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """Reload documents from disk by their ids.

        :param keys: list of doc ids
        :return: list of documents whose ``chunks`` field contains one entry per
            stored chunk file (in the order they were written); a missing id
            yields ``None`` when ``keep_na_doc`` is True, otherwise it is
            silently skipped
        """

        def _chunk_order(name: str):
            # chunk files are named '<index>.<suffix>'; sort numerically so
            # '10.gif' comes after '2.gif', with a name fallback for oddballs
            stem = os.path.splitext(name)[0]
            return (0, int(stem), name) if stem.isdigit() else (1, 0, name)

        res = []
        for key in keys:
            target_dir = os.path.join(self.data_path, str(key))
            if not os.path.exists(target_dir):
                # missing doc: append a placeholder only if requested, so the
                # result list stays aligned with the requested keys
                if self.keep_na_doc:
                    res.append(self._NOT_FOUND)
                continue
            doc = gnes_pb2.Document()
            with open(os.path.join(target_dir, '.meta'), 'rb') as f:
                doc.meta_info = f.read()
            for raw_file in sorted(os.listdir(target_dir), key=_chunk_order):
                full_path = os.path.join(target_dir, raw_file)
                # skip the meta file and sub-directories; isdir must be checked
                # on the full path (a bare filename would be resolved vs. CWD)
                if raw_file == '.meta' or os.path.isdir(full_path):
                    continue
                c = doc.chunks.add()
                c.doc_id = key
                with open(full_path, 'rb') as raw:
                    c.raw = raw.read()
            res.append(doc)
        return res