import os
from typing import List
from ..base import BaseDocIndexer as BDI
from ...proto import gnes_pb2
class DirectoryIndexer(BDI):
    """Store and retrieve documents as per-doc-id directories on disk.

    Layout: ``<data_path>/<doc_id>/0.<suffix>, 1.<suffix>, ...`` with the
    document's ``meta_info`` kept in a hidden ``.meta`` file alongside the
    chunk files.
    """

    def __init__(self, data_path: str,
                 keep_na_doc: bool = True,
                 file_suffix: str = 'gif',
                 *args, **kwargs):
        """
        :param data_path: root directory; one sub-folder per doc id is created under it
        :param keep_na_doc: when True, missing docs appear as ``None`` placeholders
            in ``query`` results so the output stays aligned with the input keys
        :param file_suffix: file extension used for each stored chunk
        """
        super().__init__(*args, **kwargs)
        self.data_path = data_path
        self.file_suffix = file_suffix
        self.keep_na_doc = keep_na_doc
        self._NOT_FOUND = None  # placeholder returned for missing docs when keep_na_doc

    @BDI.update_counter
    def add(self, keys: List[int], docs: List['gnes_pb2.Document'], *args, **kwargs):
        """Write the chunks of each document to disk.

        Folder structure: ``<data_path>/<doc_id>/0.<suffix>, 1.<suffix>, ...``

        :param keys: list of doc ids
        :param docs: list of docs, aligned with ``keys``
        """
        for key, doc in zip(keys, docs):
            doc_dir = os.path.join(self.data_path, str(key))
            # exist_ok avoids the exists()/makedirs() race of the naive check
            os.makedirs(doc_dir, exist_ok=True)
            # keep doc meta in a hidden .meta file next to the chunk files
            with open(os.path.join(doc_dir, '.meta'), 'wb') as f:
                f.write(doc.meta_info or b'')
            for i, chunk in enumerate(doc.chunks):
                with open(os.path.join(doc_dir, '%d.%s' % (i, self.file_suffix)), 'wb') as f:
                    f.write(chunk.raw)

    def query(self, keys: List[int], *args, **kwargs) -> List['gnes_pb2.Document']:
        """Reload documents from disk by their ids.

        :param keys: list of doc ids
        :return: list of documents whose ``chunks`` field contains one entry per
            stored chunk file (in the order they were written); a missing id
            yields ``None`` when ``keep_na_doc`` is True, otherwise it is
            silently skipped
        """

        def _chunk_order(name: str):
            # chunk files are named '<index>.<suffix>'; sort numerically so
            # '10.gif' comes after '2.gif', with a name fallback for oddballs
            stem = os.path.splitext(name)[0]
            return (0, int(stem), name) if stem.isdigit() else (1, 0, name)

        res = []
        for key in keys:
            target_dir = os.path.join(self.data_path, str(key))
            if not os.path.exists(target_dir):
                # missing doc: append a placeholder only if requested, so the
                # result list stays aligned with the requested keys
                if self.keep_na_doc:
                    res.append(self._NOT_FOUND)
                continue
            doc = gnes_pb2.Document()
            with open(os.path.join(target_dir, '.meta'), 'rb') as f:
                doc.meta_info = f.read()
            for raw_file in sorted(os.listdir(target_dir), key=_chunk_order):
                full_path = os.path.join(target_dir, raw_file)
                # skip the meta file and sub-directories; isdir must be checked
                # on the full path (a bare filename would be resolved vs. CWD)
                if raw_file == '.meta' or os.path.isdir(full_path):
                    continue
                c = doc.chunks.add()
                c.doc_id = key
                with open(full_path, 'rb') as raw:
                    c.raw = raw.read()
            res.append(doc)
        return res