2019-10-31 18:33:21 +00:00
|
|
|
# A script to OCR/parse/mangle Collections label-images
|
|
|
|
# Note - this takes ~2 seconds per label-image
|
|
|
|
# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
|
|
|
|
# https://github.com/fieldmuseum/Collections-OCR
|
|
|
|
|
2019-11-12 19:03:20 +00:00
|
|
|
library(tidyr)
|
2019-10-31 18:33:21 +00:00
|
|
|
library(magick)
|
|
|
|
library(stringr)
|
|
|
|
library(tesseract)
|
|
|
|
|
|
|
|
|
|
|
|
# download relevant languages/training data
|
|
|
|
tesseract_download("lat") # Latin
|
|
|
|
tesseract_download("deu") # German
|
|
|
|
|
|
|
|
|
|
|
|
# get list of JPG & JPEG image files
|
2019-10-31 18:51:04 +00:00
|
|
|
imagelist <- list.files(path = "images/", pattern = ".jp|.JP")
|
2019-10-31 18:33:21 +00:00
|
|
|
|
|
|
|
|
|
|
|
# setup table for OCRed text
|
|
|
|
imagesOCR <- data.frame("image" = rep("", NROW(imagelist)),
|
|
|
|
"line_count" = rep("", NROW(imagelist)),
|
|
|
|
"text" = rep("", NROW(imagelist)),
|
|
|
|
stringsAsFactors = F)
|
|
|
|
|
|
|
|
imagesOCR$line_count <- as.integer(imagesOCR$line_count)
|
|
|
|
|
|
|
|
|
|
|
|
# loop through each label-image
|
|
|
|
for (i in 1:NROW(imagelist)) {
|
|
|
|
|
|
|
|
# OCR the image to text
|
|
|
|
ocrText <- image_read(paste0("images/", imagelist[i])) %>%
|
|
|
|
image_ocr(language = c("eng", "lat", "deu"))
|
|
|
|
imagesOCR$text[i] <- ocrText
|
|
|
|
|
|
|
|
# include filename & count of lines in row
|
|
|
|
imagesOCR$image[i] <- imagelist[i]
|
|
|
|
imagesOCR$line_count[i] <- str_count(ocrText, "\n")
|
|
|
|
|
|
|
|
# show progress
|
|
|
|
print(paste(i, " - ", Sys.time()))
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
# split text lines to separate columns
|
|
|
|
ocrText <- separate(imagesOCR, text,
|
|
|
|
into = paste0("Line",
|
|
|
|
seq(1:max(imagesOCR$line_count, na.rm = T))),
|
|
|
|
# into = seq(1:20), # if need consistent NCOL
|
|
|
|
sep = "\n",
|
|
|
|
extra = "merge", fill = "right")
|
|
|
|
|
2019-11-12 19:03:20 +00:00
|
|
|
|
2019-10-31 18:33:21 +00:00
|
|
|
# export CSV
|
|
|
|
write.csv(ocrText,
|
|
|
|
paste0("ocrText-",
|
|
|
|
gsub("\\s+|:", "", Sys.time()),
|
|
|
|
".csv"),
|
2019-10-31 18:51:04 +00:00
|
|
|
na = "",
|
2019-10-31 18:33:21 +00:00
|
|
|
row.names = F)
|