Create thank-you-receipt-address-block-parser.py

First attempt at a Python parser that extracts address blocks from historical thank-you receipts.
This commit is contained in:
Steve Dogiakos 2025-06-18 11:41:21 -06:00 committed by GitHub
parent abf18918d9
commit bfdffd156a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194

View File

@ -0,0 +1,65 @@
#!/usr/bin/env python3
"""
Extract address blocks from donation-receipt PDFs produced by Paperless-NGX.

Usage:
    python extract_addresses.py receipts.pdf [-o addresses.csv]

Requires:
    pip install pdfplumber
"""
from pathlib import Path
import csv
import re
import sys
import pdfplumber
# ⚙️ --- settings -------------------------------------------------------------
# Matches a three-line US-style mailing-address block and captures:
#   group 1 - the recipient (person or organisation) line
#   group 2 - the street line: 1-6 digits, one space, then the rest of the line
#   group 3 - "City, ST 12345" or "City, ST 12345-6789"
# NOTE: with re.VERBOSE, unescaped whitespace in the pattern is ignored, so the
# space that must follow the house number is spelled "[ ]". Written as a bare
# space it silently vanished, and any line starting with a digit (e.g. a date
# such as "2025-06-18") was accepted as a street line.
ADDRESS_RE = re.compile(
    r"""
    ^(.+?)\n                                          # line 1  person / org name
    (\d{1,6}[ ].+?)\n                                 # line 2  street / unit (digits, then a space)
    ([A-Za-z .'-]+,\s?[A-Z]{2}\s\d{5}(?:-\d{4})?)$    # line 3  City, ST ZIP
    """,
    re.MULTILINE | re.VERBOSE,
)
# -----------------------------------------------------------------------------
def extract_blocks(pdf_path: Path):
    """Generate one (name, street, city_state_zip) tuple per page of *pdf_path*.

    Each page's extracted text is scanned with ``ADDRESS_RE``; only the first
    address block on a page is reported, and pages without a match yield
    nothing.
    """
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            raw_text = page.extract_text()
            if not raw_text:
                # extract_text() may return nothing for a page; scan "" instead
                raw_text = ""
            # Collapse runs of spaces/tabs so the pattern sees single spaces
            collapsed = re.sub(r"[ \t]{2,}", " ", raw_text)
            # search() returns the same first match finditer() would produce
            first_hit = ADDRESS_RE.search(collapsed)
            if first_hit is not None:
                yield first_hit.groups()
def main():
    """Command-line entry point: extract address blocks and print or save them.

    Reads ``sys.argv``. The first argument is the PDF to scan; an optional
    ``-o PATH`` writes the results to a CSV file (header row plus one row per
    address) instead of printing them to stdout.

    Exits with status 1 and a usage message on stderr when the PDF argument is
    missing (or is ``-o`` itself) or when ``-o`` is not followed by a path.
    """
    usage = "Usage: extract_addresses.py file.pdf [-o out.csv]"

    # The PDF path must come first; "-o" in that slot means it was left out.
    if len(sys.argv) < 2 or sys.argv[1] == "-o":
        print(usage, file=sys.stderr)
        sys.exit(1)
    pdf_file = Path(sys.argv[1]).expanduser()

    out_csv = None
    if "-o" in sys.argv:
        value_index = sys.argv.index("-o") + 1
        # A trailing "-o" has no value; report it instead of raising IndexError.
        if value_index >= len(sys.argv):
            print(usage, file=sys.stderr)
            sys.exit(1)
        out_csv = Path(sys.argv[value_index]).expanduser()

    # Materialise the generator: the count is needed for the summary line.
    blocks = list(extract_blocks(pdf_file))

    # ── output ────────────────────────────────────────────────────────────────
    if out_csv:
        # Explicit UTF-8 so names with non-ASCII characters do not depend on
        # the platform's default encoding; newline="" is required by csv.
        with out_csv.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Street", "CityStateZip"])
            writer.writerows(blocks)
        print(f"Wrote {len(blocks)} addresses to {out_csv}")
    else:
        for name, street, city in blocks:
            print(name)
            print(street)
            print(city)
            print("-" * 40)
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    raise SystemExit(main())