Collections-OCR/ocrCloudVision.R

# A script to use Google Cloud Vision to OCR/parse/mangle Collections label-images
# Note!
# - this may take >30 seconds per label-image
# - running >1000 API calls/month incurs a fee
# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
# https://github.com/fieldmuseum/Collections-OCR
library(googleCloudVisionR) # NOTE - requires API Key / Service Account (see auth sketch below)
library(tidyr)
library(readr)
library(stringr)
# library(magick)
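# A minimal auth-setup sketch, assuming the service-account-key convention from the
# googleCloudVisionR README; the GCV_AUTH_FILE variable name is an assumption here
# (verify against the package docs) and the path is a placeholder:
# Sys.setenv("GCV_AUTH_FILE" = "path/to/service-account-key.json")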
# get list of local JPG & JPEG image files [REVERT]
imagelist <- list.files(path = "images/", pattern = ".jp|.JP")
imagenames <- gsub(".jp.*|.JP.*", "", imagelist)
# # Prompt user for input/output batch directory names?
# image_dir <- readline("Paste the path for the image directory: ")
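# # A minimal sketch of that prompt-driven alternative, assuming images sit directly in the
# # chosen directory (kept commented out so the hard-coded "images/" paths above and in the
# # loop below stay in effect; `image_dir` is not used elsewhere in this script):
# image_dir  <- readline("Paste the path for the image directory: ")
# imagelist  <- list.files(path = image_dir, pattern = ".jp|.JP")
# imagenames <- gsub(".jp.*|.JP.*", "", imagelist)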
# Retrieve OCR text ####
# Setup table for OCRed text
imagesOCR <- data.frame("image" = rep("", NROW(imagelist)),
                        "imagesize_MB" = rep("", NROW(imagelist)),
                        "ocr_start" = rep("", NROW(imagelist)),
                        "ocr_duration" = rep("", NROW(imagelist)),
                        "line_count" = rep("", NROW(imagelist)),
                        "text" = rep("", NROW(imagelist)),
                        stringsAsFactors = F)
imagesOCR$line_count <- as.integer(imagesOCR$line_count)
# setup output dir
# # add image_dir if use prompt above
if (!dir.exists("ocr_text")) {   # paste0(image_dir, "_out")
  dir.create("ocr_text")         # paste0(image_dir, "_out")
  print("output directory created")
} else {
  print("output directory already exists")
}
# Loop through each label-image
for (i in 1:NROW(imagelist)) {
  # # If files are over 20MB, uncomment this to lower quality + avoid error?
  # ### NOTE! This will overwrite image with lower-quality file.
  # ### (requires library(magick), commented out above)
  # if (file.info(paste0("images/", imagelist[i]))$size > 20000000) {
  #   image_write(image_read(paste0("images/", imagelist[i])),
  #               path = paste0("images/", imagelist[i]),
  #               quality = 80)
  # }
  # OCR image
  # ### NOTE! This can take over ~30s per image
  print(paste(i, "- starting OCR -", Sys.time()))
  imagesOCR$ocr_start[i] <- as.character(Sys.time())
  start <- Sys.time()
  ocr_list <- gcv_get_image_annotations(imagePaths = paste0("images/", imagelist[i]),
                                        feature = "DOCUMENT_TEXT_DETECTION") #,
                                        # savePath = paste0("ocr_text/",
                                        #                   imagenames[i], "_text.csv"))
  print(paste(i, "- finishing OCR -", Sys.time()))
  end <- Sys.time()
  # Add raw text to dataframe
  imagesOCR$text[i] <- ocr_list$description

  # Add OCR duration (in seconds), & text-lines per image, filename, filesize (in MB)
  imagesOCR$ocr_duration[i] <- as.integer(end) - as.integer(start)
  imagesOCR$line_count[i] <- str_count(ocr_list$description, "\n+")
  imagesOCR$image[i] <- imagelist[i]
  imagesOCR$imagesize_MB[i] <- round(
    file.info(paste0("images/", imagelist[i]))$size / 1000000, 2)

  # show progress
  print(paste(i, "- done -", Sys.time()))

  # rate limit to max of 240/min (Vision API limit = 1800/min)
  Sys.sleep(0.25)
}
# split text lines to separate columns (lines per image = newline-runs + 1, hence the "+ 1")
ocrText <- separate(imagesOCR, text,
                    into = paste0("Line",
                                  seq_len(max(imagesOCR$line_count, na.rm = T) + 1)),
                    # into = paste0("Line", 1:20), # if need consistent NCOL
                    sep = "(\n)+",
                    extra = "merge", fill = "right")
# export CSV
write.csv(ocrText,
          paste0("ocrText-",
                 gsub("\\s+|:", "", Sys.time()),
                 # image_dir,
                 ".csv"),
          na = "",
          row.names = F)