From ab8fc3970f52cad3682b03fd726be3f5a8c3bcf4 Mon Sep 17 00:00:00 2001 From: Kate Webbink Date: Thu, 31 Oct 2019 13:33:21 -0500 Subject: [PATCH] Added script and sample images --- ocrMangle.R | 61 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 61 insertions(+) create mode 100644 ocrMangle.R diff --git a/ocrMangle.R b/ocrMangle.R new file mode 100644 index 0000000..155695b --- /dev/null +++ b/ocrMangle.R @@ -0,0 +1,61 @@ +# A script to OCR/parse/mangle Collections label-images +# Note - this takes ~2 seconds per label-image +# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT) +# https://github.com/fieldmuseum/Collections-OCR + + +library(magick) +library(stringr) +library(tesseract) + + +# download relevant languages/training data +tesseract_download("lat") # Latin +tesseract_download("deu") # German + + +# get list of JPG & JPEG image files +imagelist <- list.files(path = "images/", pattern = ".jp") + + +# setup table for OCRed text +imagesOCR <- data.frame("image" = rep("", NROW(imagelist)), + "line_count" = rep("", NROW(imagelist)), + "text" = rep("", NROW(imagelist)), + stringsAsFactors = F) + +imagesOCR$line_count <- as.integer(imagesOCR$line_count) + + +# loop through each label-image +for (i in 1:NROW(imagelist)) { + + # OCR the image to text + ocrText <- image_read(paste0("images/", imagelist[i])) %>% + image_ocr(language = c("eng", "lat", "deu")) + imagesOCR$text[i] <- ocrText + + # include filename & count of lines in row + imagesOCR$image[i] <- imagelist[i] + imagesOCR$line_count[i] <- str_count(ocrText, "\n") + + # show progress + print(paste(i, " - ", Sys.time())) + +} + + +# split text lines to separate columns +ocrText <- separate(imagesOCR, text, + into = paste0("Line", + seq(1:max(imagesOCR$line_count, na.rm = T))), + # into = seq(1:20), # if need consistent NCOL + sep = "\n", + extra = "merge", fill = "right") + +# export CSV +write.csv(ocrText, + paste0("ocrText-", + gsub("\\s+|:", "", Sys.time()), + ".csv"), + row.names = F) \ No newline at end of file