
Add support for .txt file format
RyosukeDTomita committed Jun 27, 2024
1 parent c5cf715 commit 3520008
Showing 2 changed files with 99 additions and 34 deletions.
132 changes: 98 additions & 34 deletions 5.internal-document-search/scripts/prepdocs.py
@@ -22,7 +22,7 @@
parser = argparse.ArgumentParser(
description="Prepare documents by extracting content from PDFs, splitting content into sections, uploading to blob storage, and indexing in a search index.",
epilog="Example: prepdocs.py '..\data\*' --storageaccount myaccount --container mycontainer --searchservice mysearch --index myindex -v"
)
)
parser.add_argument("files", help="Files to be processed")
parser.add_argument("--category", help="Value for the category field in the search index for all sections indexed in this run")
parser.add_argument("--skipblobs", action="store_true", help="Skip uploading individual pages to Azure Blob Storage")
@@ -59,12 +59,14 @@
exit(1)
formrecognizer_creds = default_creds if args.formrecognizerkey is None else AzureKeyCredential(args.formrecognizerkey)

def blob_name_from_file_page(filename, page = 0):

def blob_name_from_file_page(filename, page=0):
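    # PDF pages are stored as separate blobs named "<file>-<page>.pdf"; other formats keep the original filename.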
if os.path.splitext(filename)[1].lower() == ".pdf":
return os.path.splitext(os.path.basename(filename))[0] + f"-{page}" + ".pdf"
else:
return os.path.basename(filename)


def upload_blobs(filename):
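    # Upload a source file to Azure Blob Storage; PDFs are split so each page becomes its own blob.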
blob_service = BlobServiceClient(account_url=f"https://{args.storageaccount}.blob.core.windows.net", credential=storage_creds)
blob_container = blob_service.get_container_client(args.container)
@@ -77,7 +79,8 @@ def upload_blobs(filename):
pages = reader.pages
for i in range(len(pages)):
blob_name = blob_name_from_file_page(filename, i)
if args.verbose: print(f"\tUploading blob for page {i} -> {blob_name}")
if args.verbose:
print(f"\tUploading blob for page {i} -> {blob_name}")
f = io.BytesIO()
writer = PdfWriter()
writer.add_page(pages[i])
@@ -86,11 +89,13 @@ def upload_blobs(filename):
blob_container.upload_blob(blob_name, f, overwrite=True)
else:
blob_name = blob_name_from_file_page(filename)
with open(filename,"rb") as data:
with open(filename, "rb") as data:
blob_container.upload_blob(blob_name, data, overwrite=True)


def remove_blobs(filename):
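    # Remove the blobs for a single file, or all blobs in the container when filename is None.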
if args.verbose: print(f"Removing blobs for '{filename or '<all>'}'")
if args.verbose:
print(f"Removing blobs for '{filename or '<all>'}'")
blob_service = BlobServiceClient(account_url=f"https://{args.storageaccount}.blob.core.windows.net", credential=storage_creds)
blob_container = blob_service.get_container_client(args.container)
if blob_container.exists():
@@ -100,9 +105,11 @@ def remove_blobs(filename):
prefix = os.path.splitext(os.path.basename(filename))[0]
            blobs = filter(lambda b: re.match(rf"{prefix}-\d+\.pdf", b), blob_container.list_blob_names(name_starts_with=os.path.splitext(os.path.basename(prefix))[0]))
for b in blobs:
if args.verbose: print(f"\tRemoving blob {b}")
if args.verbose:
print(f"\tRemoving blob {b}")
blob_container.delete_blob(b)


def table_to_html(table):
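    # Convert a Form Recognizer table to an HTML <table>, preserving header cells and row/column spans.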
table_html = "<table>"
rows = [sorted([cell for cell in table.cells if cell.row_index == i], key=lambda cell: cell.column_index) for i in range(table.row_count)]
@@ -111,14 +118,30 @@ def table_to_html(table):
for cell in row_cells:
tag = "th" if (cell.kind == "columnHeader" or cell.kind == "rowHeader") else "td"
cell_spans = ""
if cell.column_span > 1: cell_spans += f" colSpan={cell.column_span}"
if cell.row_span > 1: cell_spans += f" rowSpan={cell.row_span}"
if cell.column_span > 1:
cell_spans += f" colSpan={cell.column_span}"
if cell.row_span > 1:
cell_spans += f" rowSpan={cell.row_span}"
table_html += f"<{tag}{cell_spans}>{html.escape(cell.content)}</{tag}>"
table_html +="</tr>"
table_html += "</tr>"
table_html += "</table>"
return table_html


def get_document_text(filename):
    file_extension = os.path.splitext(filename)[1].lower()
    match file_extension:
        case ".pdf":
            page_map = _parse_pdf(filename)
        case ".txt":
            page_map = _parse_txt(filename)
        case _:
            raise ValueError(f"Unsupported file format: {file_extension}")

    return page_map


def _parse_pdf(filename: str) -> list:
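    # Parse a PDF into a list of (page_num, offset, page_text) tuples, either locally or via Azure Form Recognizer.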
offset = 0
page_map = []
if args.localpdfparser:
@@ -129,10 +152,11 @@ def get_document_text(filename):
page_map.append((page_num, offset, page_text))
offset += len(page_text)
else:
if args.verbose: print(f"Extracting text from '{filename}' using Azure Form Recognizer")
if args.verbose:
print(f"Extracting text from '{filename}' using Azure Form Recognizer")
form_recognizer_client = DocumentAnalysisClient(endpoint=f"https://{args.formrecognizerservice}.cognitiveservices.azure.com/", credential=formrecognizer_creds, headers={"x-ms-useragent": "azure-search-chat-demo/1.0.0"})
with open(filename, "rb") as f:
poller = form_recognizer_client.begin_analyze_document("prebuilt-layout", document = f)
            poller = form_recognizer_client.begin_analyze_document("prebuilt-layout", document=f)  # error occurs here
form_recognizer_results = poller.result()

for page_num, page in enumerate(form_recognizer_results.pages):
@@ -141,13 +165,13 @@ def get_document_text(filename):
# mark all positions of the table spans in the page
page_offset = page.spans[0].offset
page_length = page.spans[0].length
table_chars = [-1]*page_length
table_chars = [-1] * page_length
for table_id, table in enumerate(tables_on_page):
for span in table.spans:
# replace all table spans with "table_id" in table_chars array
for i in range(span.length):
idx = span.offset - page_offset + i
if idx >=0 and idx < page_length:
if idx >= 0 and idx < page_length:
table_chars[idx] = table_id

            # build page text by replacing characters in table spans with table html
@@ -163,13 +187,38 @@ def get_document_text(filename):
page_text += " "
page_map.append((page_num, offset, page_text))
offset += len(page_text)
return page_map


def _parse_txt(filename: str) -> list:
    page_map = []
    page_text = ""
    page_num = 0
    offset = 0
    if args.verbose:
        print(f"Extracting text from '{filename}' using chardet to detect the encoding")
    # check the file encoding format before reading
    with open(filename, 'rb') as f:
        raw_data = f.read()
        result = chardet.detect(raw_data)
        encoding = result['encoding']
    with open(filename, 'r', encoding=encoding) as f:
        lines = f.readlines()
        for i, line in enumerate(lines):
            page_text += line
            # every 20 lines are considered a page
            if (i + 1) % 20 == 0:
                page_map.append((page_num, offset, page_text))
                page_num += 1
                offset += len(page_text)
                page_text = ""
    # append any remaining lines as the final page
    if page_text:
        page_map.append((page_num, offset, page_text))
    return page_map


def split_text(page_map):
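    # Split the combined page text into overlapping sections, preferring breaks at sentence endings or word boundaries.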
SENTENCE_ENDINGS = [".", "!", "?"]
WORDS_BREAKS = [",", ";", ":", " ", "(", ")", "[", "]", "{", "}", "\t", "\n"]
if args.verbose: print(f"Splitting '{filename}' into sections")
if args.verbose:
print(f"Splitting '{filename}' into sections")

def find_page(offset):
l = len(page_map)
@@ -195,7 +244,7 @@ def find_page(offset):
last_word = end
end += 1
if end < length and all_text[end] not in SENTENCE_ENDINGS and last_word > 0:
end = last_word # Fall back to at least keeping a whole word
end = last_word # Fall back to at least keeping a whole word
if end < length:
end += 1

@@ -218,26 +267,30 @@ def find_page(offset):
# If the section ends with an unclosed table, we need to start the next section with the table.
# If table starts inside SENTENCE_SEARCH_LIMIT, we ignore it, as that will cause an infinite loop for tables longer than MAX_SECTION_LENGTH
# If last table starts inside SECTION_OVERLAP, keep overlapping
if args.verbose: print(f"Section ends with unclosed table, starting next section with the table at page {find_page(start)} offset {start} table start {last_table_start}")
if args.verbose:
print(f"Section ends with unclosed table, starting next section with the table at page {find_page(start)} offset {start} table start {last_table_start}")
start = min(end - SECTION_OVERLAP, start + last_table_start)
else:
start = end - SECTION_OVERLAP

if start + SECTION_OVERLAP < end:
yield (all_text[start:end], find_page(start))


def create_sections(filename, page_map):
for i, (section, pagenum) in enumerate(split_text(page_map)):
yield {
"id": re.sub("[^0-9a-zA-Z_-]","_",f"{filename}-{i}"),
"id": re.sub("[^0-9a-zA-Z_-]", "_", f"{filename}-{i}"),
"content": section,
"category": args.category,
"sourcepage": blob_name_from_file_page(filename, pagenum),
"sourcefile": filename
}


def create_search_index():
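    # Create the search index only if it does not exist yet.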
if args.verbose: print(f"Ensuring search index {args.index} exists")
if args.verbose:
print(f"Ensuring search index {args.index} exists")
index_client = SearchIndexClient(endpoint=f"https://{args.searchservice}.search.windows.net/",
credential=search_creds)
if args.index not in index_client.list_index_names():
@@ -256,16 +309,20 @@ def create_search_index():
prioritized_fields=PrioritizedFields(
title_field=None, prioritized_content_fields=[SemanticField(field_name='content')]))])
)
if args.verbose: print(f"Creating {args.index} search index")
if args.verbose:
print(f"Creating {args.index} search index")
index_client.create_index(index)
else:
if args.verbose: print(f"Search index {args.index} already exists")
if args.verbose:
print(f"Search index {args.index} already exists")


def index_sections(filename, sections):
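    # Upload sections to the search index in batches of 1,000 documents.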
if args.verbose: print(f"Indexing sections from '{filename}' into search index '{args.index}'")
if args.verbose:
print(f"Indexing sections from '{filename}' into search index '{args.index}'")
search_client = SearchClient(endpoint=f"https://{args.searchservice}.search.windows.net/",
index_name=args.index,
credential=search_creds)
index_name=args.index,
credential=search_creds)
i = 0
batch = []
for s in sections:
@@ -274,39 +331,46 @@ def remove_from_index(filename):
if i % 1000 == 0:
results = search_client.upload_documents(documents=batch)
succeeded = sum([1 for r in results if r.succeeded])
if args.verbose: print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
if args.verbose:
print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
batch = []

if len(batch) > 0:
results = search_client.upload_documents(documents=batch)
succeeded = sum([1 for r in results if r.succeeded])
if args.verbose: print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")
if args.verbose:
print(f"\tIndexed {len(results)} sections, {succeeded} succeeded")


def remove_from_index(filename):
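    # Delete matching documents in batches of up to 1,000 until none remain.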
if args.verbose: print(f"Removing sections from '{filename or '<all>'}' from search index '{args.index}'")
if args.verbose:
print(f"Removing sections from '{filename or '<all>'}' from search index '{args.index}'")
search_client = SearchClient(endpoint=f"https://{args.searchservice}.search.windows.net/",
index_name=args.index,
credential=search_creds)
index_name=args.index,
credential=search_creds)
while True:
filter = None if filename is None else f"sourcefile eq '{os.path.basename(filename)}'"
r = search_client.search("", filter=filter, top=1000, include_total_count=True)
if r.get_count() == 0:
break
r = search_client.delete_documents(documents=[{ "id": d["id"] } for d in r])
if args.verbose: print(f"\tRemoved {len(r)} sections from index")
r = search_client.delete_documents(documents=[{"id": d["id"]} for d in r])
if args.verbose:
print(f"\tRemoved {len(r)} sections from index")
# It can take a few seconds for search results to reflect changes, so wait a bit
time.sleep(2)


if args.removeall:
remove_blobs(None)
remove_from_index(None)
else:
if not args.remove:
create_search_index()

print("Processing files...")
for filename in glob.glob(args.files):
if args.verbose: print(f"Processing '{filename}'")
if args.verbose:
print(f"Processing '{filename}'")
if args.remove:
remove_blobs(filename)
remove_from_index(filename)
@@ -318,4 +382,4 @@ def remove_from_index(filename):
upload_blobs(filename)
page_map = get_document_text(filename)
sections = create_sections(os.path.basename(filename), page_map)
index_sections(os.path.basename(filename), sections)
index_sections(os.path.basename(filename), sections)
1 change: 1 addition & 0 deletions 5.internal-document-search/scripts/requirements.txt
@@ -5,3 +5,4 @@ azure-ai-formrecognizer==3.2.1
azure-storage-blob==12.14.1
typing==3.7.4.3
pycryptodome==3.19.1
chardet==4.0.0
