# A script to OCR/parse/mangle Collections label-images
# Note - this takes ~2 seconds per label-image
# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
# https://github.com/fieldmuseum/Collections-OCR


library(magick)
library(stringr)
library(tesseract)


# download relevant languages/training data
# Only languages that tesseract_info() does not already report as available
# are requested, so repeated runs do not repeat the download.
ocrExtraLanguages <- c("lat", "deu")  # Latin, German
for (lang in setdiff(ocrExtraLanguages, tesseract_info()$available)) {
  tesseract_download(lang)
}


# get list of JPG & JPEG image files
# The pattern is a regular expression: the dot is escaped and the match is
# anchored to the end of the name, so only files whose extension is
# .jpg/.jpeg (in any letter case) are picked up.
jpegPattern <- "\\.jpe?g$"
imagelist <- list.files(path = "images/", pattern = jpegPattern,
                        ignore.case = TRUE)


# Pre-allocate the results table for the OCRed text: one row per
# label-image, with empty strings for the filename and text, and an
# integer line count that starts out as NA.
imageCount <- NROW(imagelist)

imagesOCR <- data.frame(image = character(imageCount),
                        line_count = rep(NA_integer_, imageCount),
                        text = character(imageCount),
                        stringsAsFactors = FALSE)


# loop through each label-image, filling one row of imagesOCR per image
# seq_along() skips the loop entirely when no images were found; 1:NROW()
# would instead iterate over 1, 0 and try to read a non-existent file.
for (i in seq_along(imagelist)) {

  # OCR the image to text
  # Tesseract takes multiple languages as a single "+"-separated string.
  labelText <- image_ocr(image_read(file.path("images", imagelist[i])),
                         language = "eng+lat+deu")
  imagesOCR$text[i] <- labelText

  # include filename & count of lines in row
  # (line_count is the number of newline characters in the OCRed text)
  imagesOCR$image[i] <- imagelist[i]
  imagesOCR$line_count[i] <- str_count(labelText, "\n")

  # show progress
  print(paste(i, " - ", Sys.time()))

}


# split text lines to separate columns ("Line1", "Line2", ...)
# Done with stringr, which this script already loads; separate() belongs to
# tidyr, which is never attached here.
# At least one column is always produced, even when no newline was found
# or the table is empty.
lineTotal <- max(1L, imagesOCR$line_count, na.rm = TRUE)
# lineTotal <- 20L  # if need consistent NCOL

# Split into at most lineTotal pieces; any surplus text stays in the last
# piece (the equivalent of separate(extra = "merge")).
lineMatrix <- str_split_fixed(imagesOCR$text, "\n", n = lineTotal)

# str_split_fixed() pads short rows with ""; turn that padding into NA so a
# missing line stays distinguishable from a genuinely empty one
# (the equivalent of separate(fill = "right")).
pieceCount <- str_count(imagesOCR$text, "\n") + 1L
lineMatrix[col(lineMatrix) > pieceCount] <- NA

lineColumns <- as.data.frame(lineMatrix, stringsAsFactors = FALSE)
names(lineColumns) <- paste0("Line", seq_len(lineTotal))

# Keep every original column except the raw text, then append the lines.
ocrText <- cbind(imagesOCR[setdiff(names(imagesOCR), "text")], lineColumns)

# export CSV to a timestamped file in the working directory
# format() pins the timestamp layout (e.g. "2019-05-01123456"); the implicit
# character conversion of Sys.time() depends on the R version and can
# include fractional seconds.
# NA cells are written as empty fields.
write.csv(ocrText,
          paste0("ocrText-",
                 format(Sys.time(), "%Y-%m-%d%H%M%S"),
                 ".csv"),
          na = "",
          row.names = FALSE)