Como tornar um PDF pesquisável para um aplicativo de pesquisa de balão?

Eu tenho pesquisado um projeto pessoal muito importante. Gostaria de criar um aplicativo de pesquisa em balão que me permita pesquisar conteúdo em 100 arquivos PDF Plus. Encontrei algumas informações em torno do A ElasticSearch Lib que funcionam bem com o balão.

#!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64
import json
from flask import Flask, jsonify, request, render_template,  json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)

# create a new PDF object with FPDF
pdf = FPDF()

# use an iterator to create 10 pages
for page in range(10):
    pdf.add_page()
    pdf.set_font("Arial", size=14)
    pdf.cell(150, 12, txt="Object Rocket ROCKS!!", ln=1, align="C")

# output all of the data to a new PDF file
pdf.output("object_rocket.pdf")

'''
read_pdf = PyPDF2.PdfFileReader("object_rocket.pdf")
page = read_pdf.getPage(0)
page_mode = read_pdf.getPageMode()
page_text = page.extractText()
print (type(page_text))
'''
#with open(path, 'rb') as file:

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)

# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()

# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)

# create a dictionary object for page data
all_pages = {}

# put meta data into a dict key
all_pages["meta"] = {}

# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print (meta, value)
    all_pages["meta"][meta] = value

# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

# create a JSON string from the dictionary
json_data = json.dumps(all_pages)
#print ("\nJSON:", json_data)

# convert JSON string to bytes-like obj
bytes_string = bytes(json_data, 'utf-8')
#print ("\nbytes_string:", bytes_string)

# convert bytes to base64 encoded string
encoded_pdf = base64.b64encode(bytes_string)
encoded_pdf = str(encoded_pdf)
#print ("\nbase64:", encoded_pdf)

# put the PDF data into a dictionary body to pass to the API request
body_doc = {"data": encoded_pdf}

# call the index() method to index the data
result = elastic_client.index(index="pdf", doc_type="_doc", id="42", body=body_doc)

# print the returned sresults
#print ("\nindex result:", result['result'])

# make another Elasticsearch API request to get the indexed PDF
result = elastic_client.get(index="pdf", doc_type='_doc', id=42)

# print the data to terminal
result_data = result["_source"]["data"]
#print ("\nresult_data:", result_data, '-- type:', type(result_data))

# decode the base64 data (use to [:] to slice off
# the 'b and ' in the string)
decoded_pdf = base64.b64decode(result_data[2:-1]).decode("utf-8")
#print ("\ndecoded_pdf:", decoded_pdf)

# take decoded string and make into JSON object
json_dict = json.loads(decoded_pdf)
#print ("\njson_str:", json_dict, "\n\ntype:", type(json_dict))
result2 = elastic_client.index(index="pdftext", doc_type="_doc", id="42", body=json_dict)

# create new FPDF object
pdf = FPDF()

# build the new PDF from the Elasticsearch dictionary
# Use 'iteritems()` instead of 'items()' for Python 2
""" for page, value in json_data:
    if page != "meta":
        # create new page
        pdf.add_page()
        pdf.set_font("Arial", size=14)

        # add content to page
        output = value + " -- Page: " + str(int(page)+1)
        pdf.cell(150, 12, txt=output, ln=1, align="C")
    else:
        # create the meta data for the new PDF
        for meta, meta_val in json_dict["meta"].items():
            if "title" in meta.lower():
                pdf.set_title(meta_val)
            elif "producer" in meta.lower() or "creator" in meta.lower():
                pdf.set_creator(meta_val)
 """
# output the PDF object's data to a PDF file
#pdf.output("object_rocket_from_elaticsearch.pdf" )

@app.route('/', methods=['GET'])
def index():

    return jsonify(json_dict)

@app.route('/<id>', methods=['GET'])
def index_by_id(id):

    return jsonify(json_dict[id])


""" @app.route('/insert_data', methods=['PUT'])
def insert_data():
    slug = request.form['slug']
    title = request.form['title']
    content = request.form['content']

    body = {
        'slug': slug,
        'title': title,
        'content': content,
        'timestamp': datetime.now()
    }

    result = es.index(index='contents', doc_type='title', id=slug, body=body)

    return jsonify(result) """



app.run(port=5003, debug=True)

------ Progresso ------ Agora, tenho uma solução funcional sem capacidade de pesquisa de front-end:

# Load_single_PDF_BY_PAGE_TO_index.py
  #!/usr/bin/env python3
#-*- coding: utf-8 -*-

# import libraries to help read and create PDF
import PyPDF2
from fpdf import FPDF
import base64

from flask import Flask, jsonify, request, render_template,  json
from datetime import datetime
import pandas as pd

# import the Elasticsearch low-level client library
from elasticsearch import Elasticsearch
# create a new client instance of Elasticsearch
elastic_client = Elasticsearch(hosts=["localhost"])
es = Elasticsearch("http://localhost:9200/")
app = Flask(__name__)


#with open(path, 'rb') as file:

# get the PDF path and read the file
file = "Sheet3.pdf"
read_pdf = PyPDF2.PdfFileReader(file, strict=False)
#print (read_pdf)

# get the read object's meta info
pdf_meta = read_pdf.getDocumentInfo()

# get the page numbers
num = read_pdf.getNumPages()
print ("PDF pages:", num)

# create a dictionary object for page data
all_pages = {}

# put meta data into a dict key
all_pages["meta"] = {}

# Use 'iteritems()` instead of 'items()' for Python 2
for meta, value in pdf_meta.items():
    print (meta, value)
    all_pages["meta"][meta] = value

x = 44
# iterate the page numbers
for page in range(num):
    data = read_pdf.getPage(page)
    #page_mode = read_pdf.getPageMode()

    # extract the page's text
    page_text = data.extractText()

    # put the text data into the dict
    all_pages[page] = page_text

    body_doc2 = {"data": page_text}
    result3 = elastic_client.index(index="pdfclearn", doc_type="_doc", id=x, body=body_doc2)
    x += 1

O código acima carrega um único pdf no elasticsearch por página.

from flask import Flask, jsonify, request,render_template
from elasticsearch import Elasticsearch
from datetime import datetime
es = Elasticsearch("http://localhost:9200/")

app = Flask(__name__)

@app.route('/pdf', methods=['GET'])
def index():
    results = es.get(index='pdfclearn', doc_type='_doc', id='44')
    return jsonify(results['_source'])


@app.route('/pdf/<id>', methods=['GET'])
def index_by_id(id):
    results = es.get(index='pdfclearn', doc_type='_doc', id=id)
    return jsonify(results['_source'])



@app.route('/search/<keyword>', methods=['POST','GET'])
def search(keyword):
    keyword = keyword

    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

@app.route("/searhbar")
def searhbar():
    return render_template("index.html")

@app.route("/searhbar/<string:box>")
def process(box):
    query = request.args.get('query')
    if box == 'names':
         keyword = box

    body = {
        "query": {
            "multi_match": {
                "query": keyword,
                "fields": ["data"]
            }
        }
    }

    res = es.search(index="pdfclearn", doc_type="_doc", body=body)

    return jsonify(res['hits']['hits'])

app.run(port=5003, debug=True)

No código acima, podemos pesquisar em todas as páginas uma palavra-chave ou frase.

curl http://127.0.0.1:5003/search/test //it works!!

Encontrei um blog sobre como pesquisar arquivos PDF como um índice Base64 no ElasticSearch. Vi a API da DocuSign fazer isso para modelagem de documentos. No entanto, eu não entendo Como Jsonify o PDF Base64 de uma maneira que seja pesquisável no ElasticSearch.

curl "http://localhost:9200/pdftext/_doc/42"

curl -X POST "http://localhost:9200/pdf/_search?q=*"

Posso recuperar o Base64 de um documento de 700 páginas. Mas acho que o que preciso é indexar e recuperar cada página do documento.

Blogs que eu estudei que me fizeram parte do caminho:

final de jogo:

https://towardsdatascience.com/create-a-full-search-engine-via-flask-elasticsearch-javascript-d3js-and-bootstrap-275f9dc6efe1

Continuarei estudando a Elastic Search, a Codificação e a decodificação Base64. Mas gostaria de alguma ajuda para alcançar meu objetivo. Qualquer exemplo detalhado seria muito apreciado.

python flask search Raposa Negra
fonte

Encontrado um Lib para Python, Whoosh: whoosh.readthedocs.io/en/latest/intro.html . Vou tentar uma nova abordagem com esta lib próxima

BlackFox 08/02

Agora testando uma lib para python, Scout: scout.readthedocs.io/en/latest/installation.html

BlackFox

Como tornar um PDF pesquisável para um aplicativo de pesquisa de balão?

Respostas: