mirror of
https://github.com/snachodog/handy-debian-scripts.git
synced 2025-07-01 00:22:23 -06:00
Create thank-you-receipt-address-block-parser.py
First attempt at a Python parser that extracts address blocks from historic thank-you receipts.
This commit is contained in:
parent
abf18918d9
commit
bfdffd156a
65
thank-you-receipt-address-block-parser.py
Normal file
65
thank-you-receipt-address-block-parser.py
Normal file
@ -0,0 +1,65 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract address blocks from donation-receipt PDFs produced by Paperless-NGX.
|
||||
|
||||
Usage:
|
||||
python extract_addresses.py receipts.pdf [-o addresses.csv]
|
||||
Requires:
|
||||
pip install pdfplumber
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
import csv
|
||||
import re
|
||||
import sys
|
||||
import pdfplumber
|
||||
|
||||
# ⚙️ --- settings -------------------------------------------------------------
|
||||
# Matches a three-line postal address block: name, street, "City, ST ZIP".
# NOTE: under re.VERBOSE a bare space in the pattern is ignored, so the
# separator after the house number is written as "[ ]" to make it mandatory.
# Without it any line that merely starts with a digit (e.g. a date such as
# "12/25/2023") would be accepted as the street line.
ADDRESS_RE = re.compile(
    r"""^(.+?)\n                                        # line 1 – person / org name
        (\d{1,6}[ ].+?)\n                               # line 2 – street / unit (digits, a space, the rest)
        ([A-Za-z .'-]+,\s?[A-Z]{2}\s\d{5}(?:-\d{4})?)$  # line 3 – City, ST ZIP
    """,
    re.MULTILINE | re.VERBOSE,
)
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
def extract_blocks(pdf_path: Path):
    """Lazily produce at most one address per page of *pdf_path*.

    For every page, the extracted text has runs of two or more spaces/tabs
    collapsed to a single space, and the first ``ADDRESS_RE`` match (if any)
    is yielded as a ``(name, street, city_state_zip)`` tuple. Pages without
    a match contribute nothing.
    """
    with pdfplumber.open(pdf_path) as document:
        for page in document.pages:
            # extract_text() may give back None; fall back to an empty string.
            raw_text = page.extract_text() or ""
            # Paperless sometimes inserts double-spaces; normalise first
            cleaned = re.sub(r"[ \t]{2,}", " ", raw_text)
            # Only the first address on a page is wanted.
            first_hit = ADDRESS_RE.search(cleaned)
            if first_hit is not None:
                yield first_hit.groups()
|
||||
|
||||
def main():
    """Command-line entry point: extract addresses from a PDF and emit them.

    Arguments are read from ``sys.argv``: one positional PDF path and an
    optional ``-o FILE`` that selects CSV output. With ``-o`` a header row
    plus one row per address is written to FILE; otherwise each address is
    printed to stdout as three lines followed by a 40-dash separator.

    Exits with status 1 and a usage message on stderr when the PDF path is
    missing or when ``-o`` is given without a file name.
    """
    usage = "Usage: extract_addresses.py file.pdf [-o out.csv]"
    args = sys.argv[1:]

    out_csv = None
    if "-o" in args:
        flag_pos = args.index("-o")
        # A trailing "-o" has no value; report it instead of raising IndexError.
        if flag_pos + 1 >= len(args):
            print(usage, file=sys.stderr)
            sys.exit(1)
        out_csv = Path(args[flag_pos + 1]).expanduser()
        # Remove the flag and its value so neither is mistaken for the PDF path.
        del args[flag_pos:flag_pos + 2]

    if not args:
        print(usage, file=sys.stderr)
        sys.exit(1)

    pdf_file = Path(args[0]).expanduser()
    blocks = list(extract_blocks(pdf_file))

    # ── output ────────────────────────────────────────────────────────────────
    if out_csv is not None:
        # Explicit UTF-8 keeps non-ASCII names from depending on the locale;
        # newline="" lets the csv module control line endings itself.
        with out_csv.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f)
            writer.writerow(["Name", "Street", "CityStateZip"])
            writer.writerows(blocks)
        print(f"Wrote {len(blocks)} addresses to {out_csv}")
    else:
        for name, street, city in blocks:
            print(name)
            print(street)
            print(city)
            print("-" * 40)
|
||||
|
||||
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
Loading…
x
Reference in New Issue
Block a user