Added script and sample images

2019-10-31 13:33:21 -05:00 · 2019-10-31 13:33:21 -05:00 · ab8fc3970f
parent 6b3c6fa857
commit ab8fc3970f
1 changed files with 61 additions and 0 deletions
--- a/ocrMangle.R
+++ b/ocrMangle.R
@ -0,0 +1,61 @@
+# A script to OCR/parse/mangle Collections label-images
+# Note - this takes ~2 seconds per label-image
+# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
+# https://github.com/fieldmuseum/Collections-OCR
+
+
+library(magick)
+library(stringr)
+library(tesseract)
+
+
+# download relevant languages/training data
+tesseract_download("lat")  # Latin
+tesseract_download("deu")  # German
+
+
+# get list of JPG & JPEG image files
+imagelist <- list.files(path = "images/", pattern = ".jp")
+
+
+# setup table for OCRed text
+imagesOCR <- data.frame("image" = rep("", NROW(imagelist)),
+                        "line_count" = rep("", NROW(imagelist)),
+                        "text" = rep("", NROW(imagelist)),
+                        stringsAsFactors = F)
+
+imagesOCR$line_count <- as.integer(imagesOCR$line_count)
+
+
+# loop through each label-image
+for (i in 1:NROW(imagelist)) {
+
+  # OCR the image to text
+  ocrText <- image_read(paste0("images/", imagelist[i])) %>%
+    image_ocr(language = c("eng", "lat", "deu"))
+  imagesOCR$text[i] <- ocrText
+  
+  # include filename & count of lines in row
+  imagesOCR$image[i] <- imagelist[i]
+  imagesOCR$line_count[i] <- str_count(ocrText, "\n")
+  
+  # show progress
+  print(paste(i, " - ", Sys.time()))
+
+}
+
+
+# split text lines to separate columns
+ocrText <- separate(imagesOCR, text,
+                    into = paste0("Line", 
+                                  seq(1:max(imagesOCR$line_count, na.rm = T))),
+                    # into = seq(1:20),  # if need consistent NCOL
+                    sep = "\n",
+                    extra = "merge", fill = "right")
+
+# export CSV
+write.csv(ocrText, 
+          paste0("ocrText-",
+                 gsub("\\s+|:", "", Sys.time()),
+                 ".csv"),
+          row.names = F)