handy-debian-scripts/thank-you-receipt-address-block-parser.py
Steve Dogiakos bfdffd156a
Create thank-you-receipt-address-block-parser.py
First try at a Python parser to extract address blocks from historic thank-you receipts.
2025-06-18 11:41:21 -06:00

66 lines
2.3 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract address blocks from donation-receipt PDFs produced by Paperless-NGX.
Usage:
python extract_addresses.py receipts.pdf [-o addresses.csv]
Requires:
pip install pdfplumber
"""
from pathlib import Path
import csv
import re
import sys
import pdfplumber
# ⚙️ --- settings -------------------------------------------------------------
# Three-line postal address block: name, street, "City, ST ZIP".
#
# NOTE: with re.VERBOSE, unescaped whitespace outside a character class is
# dropped from the pattern.  The space that must separate the house number
# from the rest of the street line is therefore written as "[ ]"; a bare
# space would be ignored and the group would accept any line that merely
# starts with a digit (e.g. a date such as "2025-06-18").
ADDRESS_RE = re.compile(
    r"""^(.+?)\n                                          # line 1  person / org name
        (\d{1,6}[ ].+?)\n                                 # line 2  street / unit (digits, space, rest)
        ([A-Za-z .'-]+,\s?[A-Z]{2}\s\d{5}(?:-\d{4})?)$    # line 3  City, ST ZIP
    """,
    re.MULTILINE | re.VERBOSE,
)
# -----------------------------------------------------------------------------
def extract_blocks(pdf_path: Path):
    """Yield one (name, street, city_state_zip) tuple per page of *pdf_path*.

    Each page's extracted text is searched with ADDRESS_RE and only the
    first address block on the page is reported.  Pages with no extractable
    text, or with no matching block, yield nothing.
    """
    with pdfplumber.open(pdf_path) as document:
        for pdf_page in document.pages:
            # extract_text() may return None; treat that as an empty page.
            raw_text = pdf_page.extract_text() or ""
            # Paperless sometimes inserts double-spaces; collapse runs of
            # spaces/tabs to a single space before matching.
            collapsed = re.sub(r"[ \t]{2,}", " ", raw_text)
            # Only the first address on a page is wanted.
            first_hit = ADDRESS_RE.search(collapsed)
            if first_hit is not None:
                yield first_hit.groups()
def main():
    """Command-line entry point.

    Invocation: ``file.pdf [-o out.csv]``.  Without ``-o`` the extracted
    address blocks are printed to stdout, each followed by a 40-dash rule.
    With ``-o`` they are written to the given CSV file under the header
    ``Name, Street, CityStateZip``.

    Exits with status 1 (usage message on stderr) when the PDF argument is
    missing or when ``-o`` is given without a following path.
    """
    usage = "Usage: extract_addresses.py file.pdf [-o out.csv]"
    args = sys.argv[1:]

    # Pull "-o <path>" out of the argument list first so that it can appear
    # before or after the PDF path, and so a dangling "-o" is reported as a
    # usage error instead of raising IndexError.
    out_csv = None
    if "-o" in args:
        flag_at = args.index("-o")
        if flag_at + 1 >= len(args):
            print(usage, file=sys.stderr)
            sys.exit(1)
        out_csv = Path(args[flag_at + 1]).expanduser()
        del args[flag_at:flag_at + 2]

    if not args:
        print(usage, file=sys.stderr)
        sys.exit(1)
    pdf_file = Path(args[0]).expanduser()

    blocks = list(extract_blocks(pdf_file))

    # ── output ────────────────────────────────────────────────────────────
    if out_csv:
        # newline="" is what the csv module expects; explicit UTF-8 keeps
        # non-ASCII names from depending on the locale's default encoding.
        with out_csv.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Street", "CityStateZip"])
            writer.writerows(blocks)
        print(f"Wrote {len(blocks)} addresses to {out_csv}")
    else:
        for name, street, city in blocks:
            print(name)
            print(street)
            print(city)
            print("-" * 40)


# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()