Source code for gnes.indexer.doc.filesys

import os
from typing import List

from ..base import BaseDocIndexer as BDI
from ...proto import gnes_pb2

class DirectoryIndexer(BDI):
    """Doc indexer that persists each document's chunks as files on disk.

    Layout on disk::

        <data_path>/<doc_id>/.meta          # document meta_info bytes
        <data_path>/<doc_id>/0.<suffix>     # raw bytes of chunk 0
        <data_path>/<doc_id>/1.<suffix>     # raw bytes of chunk 1, ...
    """

    def __init__(self, data_path: str, keep_na_doc: bool = True,
                 file_suffix: str = 'gif', *args, **kwargs):
        """
        :param data_path: root directory under which documents are stored
        :param keep_na_doc: if True, ``query`` appends a ``None`` placeholder
            for keys that have no directory on disk; if False such keys are
            silently skipped
        :param file_suffix: file extension used for each chunk file
        """
        super().__init__(*args, **kwargs)
        self.data_path = data_path
        self.file_suffix = file_suffix
        self.keep_na_doc = keep_na_doc
        self._NOT_FOUND = None  # placeholder returned for missing docs

    @BDI.update_counter
    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """Write the chunks of each document into the disk folder structure
        ``<data_path>/<doc_id>/0.<suffix>``, ``1.<suffix>``, ...

        :param keys: list of doc ids
        :param docs: list of docs, one per key
        """
        for k, d in zip(keys, docs):
            dirs = os.path.join(self.data_path, str(k))
            if not os.path.exists(dirs):
                os.makedirs(dirs)
            # keep doc meta in a hidden .meta file alongside the chunk files;
            # meta_info may be empty, in which case write empty bytes
            with open(os.path.join(dirs, '.meta'), 'wb') as f:
                f.write(d.meta_info or b'')
            for i, chunk in enumerate(d.chunks):
                with open(os.path.join(dirs, '%d.%s' % (i, self.file_suffix)), 'wb') as f:
                    f.write(chunk.raw)

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """Load the documents stored under the given keys.

        :param keys: list of doc ids
        :return: list of documents whose ``chunks`` field contains the raw
            bytes of every stored chunk file (one file per chunk, in the
            order they were written by :meth:`add`); a missing key yields
            ``None`` when ``keep_na_doc`` is True, otherwise it is skipped
        """
        res = []
        for k in keys:
            target_dirs = os.path.join(self.data_path, str(k))
            if not os.path.exists(target_dirs):
                # doc was never indexed; honor keep_na_doc placeholder policy
                if self.keep_na_doc:
                    res.append(self._NOT_FOUND)
                continue
            doc = gnes_pb2.Document()
            with open(os.path.join(target_dirs, '.meta'), 'rb') as f:
                doc.meta_info = f.read()
            # exclude the .meta bookkeeping file from the chunk list, and
            # test isdir against the full path (a bare filename would be
            # resolved relative to the CWD, not the doc directory)
            chunk_files = [
                fn for fn in os.listdir(target_dirs)
                if fn != '.meta' and not os.path.isdir(os.path.join(target_dirs, fn))
            ]
            # os.listdir order is platform-dependent; sort numerically so
            # chunk order matches the 0.<suffix>, 1.<suffix>, ... write order
            chunk_files.sort(key=lambda fn: int(os.path.splitext(fn)[0]))
            for raw_file in chunk_files:
                c = doc.chunks.add()
                c.doc_id = k
                with open(os.path.join(target_dirs, raw_file), 'rb') as raw:
                    c.raw = raw.read()
            res.append(doc)
        return res