# Collections-OCR/ocrCloudVision.R

# A script to use Google Cloud Vision to OCR/parse/mangle Collections label-images
# Note!
#   - this may take >30 seconds per label-image
#   - running >1000 API calls/month incurs a fee
# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
# https://github.com/fieldmuseum/Collections-OCR

library(googleCloudVisionR)  # NOTE - requires API Key / Service Account
library(tidyr)
library(readr)
library(stringr)
# library(magick)

# get list of local JPG & JPEG image files [REVERT]
# NOTE: `pattern` is a regular expression, not a glob. The dot is escaped and
# the match is anchored to the end of the name so that only files ending in
# .jpg / .jpeg (any letter case) are picked up.
imagelist <- list.files(path = "images/", pattern = "\\.jpe?g$",
                        ignore.case = TRUE)

# Base names with only the trailing .jpg / .jpeg extension removed
imagenames <- sub("\\.jpe?g$", "", imagelist, ignore.case = TRUE)


# # Prompt user for input/output batch directory names?
# image_dir <- readline("Paste the path for the image directory: ")


# Retrieve OCR text ####

# Pre-allocate the results table: one row per image file found.
# Every column starts as empty strings except line_count, which is an
# integer column initialised to NA.
blank_col <- rep("", NROW(imagelist))

imagesOCR <- data.frame(
  image = blank_col,
  imagesize_MB = blank_col,
  ocr_start = blank_col,
  ocr_duration = blank_col,
  line_count = as.integer(blank_col),
  text = blank_col,
  stringsAsFactors = FALSE
)


# Make sure the "ocr_text" output directory exists, and report which case
# applied.
# # if the image_dir prompt is enabled, use paste0(image_dir, "_out") instead
if (dir.exists("ocr_text")) {
  print("output directory already exists")
} else {
  dir.create("ocr_text")
  print("output directory created")
}


# Loop through each label-image
# seq_along() (rather than 1:NROW()) runs zero iterations when no images
# were found, instead of iterating over c(1, 0).
for (i in seq_along(imagelist)) {
  
  # Path of the image handled in this iteration
  image_path <- paste0("images/", imagelist[i])
  
  # # If files are over 20MB, uncomment this to lower quality + avoid error?
  # ### NOTE! This will overwrite image with lower-quality file.
  # 
  # if (file.info(paste0("images/", imagelist[i]))$size > 20000000) {
  #   image_write(image_read(paste0("images/", imagelist[i])),
  #               path = paste0("images/", imagelist[i]),
  #               quality = 80)

  # OCR image
  # ### NOTE! This can take over ~30s per image
  print(paste(i, "- starting OCR -", Sys.time()))
  
  start <- Sys.time()
  # Explicit format keeps the timestamp to whole seconds regardless of how
  # as.character() renders POSIXct in the running R version.
  imagesOCR$ocr_start[i] <- format(start, "%Y-%m-%d %H:%M:%S")
  
  ocr_list <- gcv_get_image_annotations(imagePaths = image_path,
                                        feature = "DOCUMENT_TEXT_DETECTION") #,
                                        # savePath = paste0("ocr_text/", 
                                        #                   imagenames[i], "_text.csv"))
  
  print(paste(i, "- finishing OCR -", Sys.time()))
  end <- Sys.time()
  
  # Pull out the raw text. If the response carries no description at all,
  # record NA rather than failing on a zero-length assignment; the NA then
  # propagates to line_count and is written as an empty CSV cell.
  ocr_text <- ocr_list$description
  if (length(ocr_text) == 0) {
    warning("No OCR text returned for ", imagelist[i],
            call. = FALSE, immediate. = TRUE)
    ocr_text <- NA_character_
  } else {
    ocr_text <- as.character(ocr_text[1])
  }
  
  # Add raw text to dataframe
  imagesOCR$text[i] <- ocr_text
  
  # Add OCR duration (in seconds), & text-lines per image, filename, filesize (in MB)
  imagesOCR$ocr_duration[i] <- as.integer(end) - as.integer(start)
  imagesOCR$line_count[i] <- str_count(ocr_text, "\n+")
  imagesOCR$image[i] <- imagelist[i]
  imagesOCR$imagesize_MB[i] <- round(file.info(image_path)$size / 1000000, 2)
  
  # show progress
  print(paste(i, "- done -", Sys.time()))
  
  # rate limit to max of 240/min (Vision API limit = 1800/min)
  Sys.sleep(0.25)
 
}


# split text lines to separate columns
# Number of "LineN" columns = largest line_count seen, but never fewer than 1.
# This guards two edge cases of seq(1:max(...)): a maximum of 0 produced two
# columns (seq(1:0) is c(1, 2)), and a table with no non-NA counts produced
# -Inf and an error.
known_counts <- imagesOCR$line_count[!is.na(imagesOCR$line_count)]
n_line_cols <- if (length(known_counts) > 0) {
  max(1L, max(known_counts))
} else {
  1L
}

ocrText <- separate(imagesOCR, text,
                    into = paste0("Line", seq_len(n_line_cols)),
                    # into = seq(1:20),  # if need consistent NCOL
                    sep = "(\n)+",
                    extra = "merge", fill = "right")


# export CSV
# The file name carries a timestamp of the form ocrText-YYYY-MM-DDHHMMSS.csv.
# format() with an explicit pattern is used instead of stripping characters
# from the implicit as.character(Sys.time()), so fractional seconds never
# leak into the file name.
write.csv(ocrText, 
          paste0("ocrText-",
                 format(Sys.time(), "%Y-%m-%d%H%M%S"),
                 # image_dir,
                 ".csv"),
          na = "",
          row.names = FALSE)
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`# A script to use Google Cloud Vision to OCR/parse/mangle Collections label-images`
			`# Note!`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`# - this may take >30 seconds per label-image`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`# - running >1000 API calls/month incurs a fee`
			`# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)`
			`# https://github.com/fieldmuseum/Collections-OCR`

			`library(googleCloudVisionR) # NOTE - requires API Key / Service Account`
			`library(tidyr)`
			`library(readr)`
			`library(stringr)`
updated readme & libraries 2019-11-29 23:10:51 +00:00			`# library(magick)`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00
			`# get list of local JPG & JPEG image files [REVERT]`
			`imagelist <- list.files(path = "images/", pattern = ".jp\|.JP")`
			`imagenames <- gsub(".jp.\|.JP.", "", imagelist)`


			`# # Prompt user for input/output batch directory names?`
			`# image_dir <- readline("Paste the path for the image directory: ")`


			`# Retrieve OCR text ####`

			`# Setup table for OCRed text`
fixed textlist typo 2019-11-29 16:52:44 +00:00			`imagesOCR <- data.frame("image" = rep("", NROW(imagelist)),`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`"imagesize_MB" = rep("", NROW(imagelist)),`
			`"ocr_start" = rep("", NROW(imagelist)),`
			`"ocr_duration" = rep("", NROW(imagelist)),`
fixed textlist typo 2019-11-29 16:52:44 +00:00			`"line_count" = rep("", NROW(imagelist)),`
			`"text" = rep("", NROW(imagelist)),`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`stringsAsFactors = F)`

			`imagesOCR$line_count <- as.integer(imagesOCR$line_count)`


			`# setup output dir`
			`# # add image_dir if use prompt above`
			`if (!dir.exists("ocr_text")) { # paste0(image_dir, "_out")`
			`dir.create("ocr_text") # paste0(image_dir, "_out")`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`print("output directory created")`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`} else {`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`print("output directory already exists")`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`}`


			`# Loop through each label-image`
			`for (i in 1:NROW(imagelist)) {`

			`# # If files are over 20MB, uncomment this to lower quality + avoid error?`
			`# ### NOTE! This will overwrite image with lower-quality file.`
			`#`
			`# if (file.info(paste0("images/", imagelist[i]))$size > 20000000) {`
			`# image_write(image_read(paste0("images/", imagelist[i])),`
			`# path = paste0("images/", imagelist[i]),`
			`# quality = 80)`

			`# OCR image`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`# ### NOTE! This can take over ~30s per image`
			`print(paste(i, "- starting OCR -", Sys.time()))`

			`imagesOCR$ocr_start[i] <- as.character(Sys.time())`
			`start <- Sys.time()`

added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`ocr_list <- gcv_get_image_annotations(imagePaths = paste0("images/", imagelist[i]),`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`feature = "DOCUMENT_TEXT_DETECTION") #,`
			`# savePath = paste0("ocr_text/",`
			`# imagenames[i], "_text.csv"))`

			`print(paste(i, "- finishing OCR -", Sys.time()))`
			`end <- Sys.time()`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00
			`# Add raw text to dataframe`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`imagesOCR$text[i] <- ocr_list$description`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00
added OCR duration to output 2019-11-29 20:12:45 +00:00			`# Add OCR duration (in seconds), & text-lines per image, filename, filesize (in MB)`
			`imagesOCR$ocr_duration[i] <- as.integer(end) - as.integer(start)`
			`imagesOCR$line_count[i] <- str_count(ocr_list$description, "\n+")`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`imagesOCR$image[i] <- imagelist[i]`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`imagesOCR$imagesize_MB[i] <- round(file.info(paste0("images/",`
			`imagelist[i]))$size`
			`/ 1000000, 2)`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00
			`# show progress`
added OCR duration to output 2019-11-29 20:12:45 +00:00			`print(paste(i, "- done -", Sys.time()))`
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00
			`# rate limit to max of 240/min (Vision API limit = 1800/min)`
			`Sys.sleep(0.25)`

			`}`


			`# split text lines to separate columns`
			`ocrText <- separate(imagesOCR, text,`
			`into = paste0("Line",`
			`seq(1:max(imagesOCR$line_count, na.rm = T))),`
			`# into = seq(1:20), # if need consistent NCOL`
			`sep = "(\n)+",`
			`extra = "merge", fill = "right")`


			`# export CSV`
			`write.csv(ocrText,`
			`paste0("ocrText-",`
			`gsub("\\s+\|:", "", Sys.time()),`
			`# image_dir,`
			`".csv"),`
			`na = "",`
			`row.names = F)`