added scripts for Vision API/OCR

2019-11-13 17:18:33 -06:00 · 2019-11-13 17:18:33 -06:00 · 20cedc6b19
parent 4cdd4f46ec
commit 20cedc6b19
3 changed files with 220 additions and 1 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,6 +2,9 @@
 .Rhistory
 .RData
 .Ruserdata
+.Renviron

 *.jpg
 *.csv
+images/*.*
+ocr_text/*.*
--- a/ocrCloudVision.R
+++ b/ocrCloudVision.R
@ -0,0 +1,93 @@
+# A script to use Google Cloud Vision to OCR/parse/mangle Collections label-images
+# Note!
+#   - this may take a few seconds per label-image
+#   - running >1000 API calls/month incurs a fee
+# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
+# https://github.com/fieldmuseum/Collections-OCR
+
+library(googleCloudVisionR)  # NOTE - requires API Key / Service Account
+library(tidyr)
+library(readr)
+library(stringr)
+library(magick)
+
+# get list of local JPG & JPEG image files [REVERT]
+# The pattern is anchored to the file extension (case-insensitive).
+# The earlier unescaped ".jp|.JP" matched "jp" after ANY character, anywhere
+# in a filename, and the matching gsub() then truncated such names.
+imagelist <- list.files(path = "images/", pattern = "\\.jpe?g$",
+                        ignore.case = TRUE)
+# image names without the extension; used below to name the output files
+imagenames <- tools::file_path_sans_ext(imagelist)
+
+
+# # Prompt user for input/output batch directory names?
+# image_dir <- readline("Paste the path for the image directory: ")
+
+
+# Retrieve OCR text ####
+
+# Setup table for OCRed text: one row per local image.
+# Sized from imagelist -- "textlist" is never defined in this script, so
+# sizing the table from it stopped the run with "object not found".
+# line_count starts as NA (integer) until the loop below fills it in.
+n_images <- length(imagelist)
+imagesOCR <- data.frame(image = character(n_images),
+                        line_count = rep(NA_integer_, n_images),
+                        text = character(n_images),
+                        stringsAsFactors = FALSE)
+
+
+# Make sure the folder that receives the OCR output is present
+# # (use paste0(image_dir, "_out") instead if the prompt above is enabled)
+if (dir.exists("ocr_text")) {
+  print("output directory exists")
+} else {
+  dir.create("ocr_text")
+}
+
+
+# Loop through each label-image
+# seq_along() skips the loop entirely when no images were found;
+# 1:NROW() would have counted down (1, 0) on an empty list.
+for (i in seq_along(imagelist)) {
+
+  image_path <- paste0("images/", imagelist[i])
+
+  # # If files are over 20MB, uncomment this to lower quality + avoid error?
+  # ### NOTE! This will overwrite image with lower-quality file.
+  #
+  # if (file.info(image_path)$size > 20000000) {
+  #   image_write(image_read(image_path),
+  #               path = image_path,
+  #               quality = 80)
+  # }
+
+  # OCR image; savePath is where the package is asked to save its results
+  # CHECK/FIX THIS FXN ####
+  ocr_list <- gcv_get_image_annotations(imagePaths = image_path,
+                                        feature = "DOCUMENT_TEXT_DETECTION",
+                                        savePath = paste0("ocr_text/",
+                                                          imagenames[i], "_text.csv"))
+
+  # Add raw text to dataframe
+  # NOTE(review): confirm that gcv_get_image_annotations() really returns a
+  # "local_path" field; if it does not, read the text from the savePath file
+  # above or from the returned table instead.
+  imagesOCR$text[i] <- read_file(ocr_list$local_path)  # CHECK/FIX THIS PATH ####
+
+  # Add filename & count of lines in row.
+  # Newline runs are counted in the OCR text itself (not in the path string,
+  # which never contains a newline and so always gave 0).
+  imagesOCR$image[i] <- imagelist[i]
+  imagesOCR$line_count[i] <- str_count(imagesOCR$text[i], "\n+")
+
+  # show progress
+  print(paste(i, " - ", Sys.time()))
+
+  # rate limit to max of 240/min (Vision API limit = 1800/min)
+  Sys.sleep(0.25)
+
+}
+
+
+# split text lines to separate columns
+# Always build at least one "Line" column: with no rows (or only NA counts)
+# max() returns -Inf and the column-name sequence could not be created.
+n_line_cols <- max(1L, imagesOCR$line_count, na.rm = TRUE)
+ocrText <- separate(imagesOCR, text,
+                    into = paste0("Line", seq_len(n_line_cols)),
+                    # into = seq(1:20),  # if need consistent NCOL
+                    sep = "(\n)+",
+                    extra = "merge", fill = "right")
+
+
+# Write the split-out table to "ocrText-<timestamp>.csv" in the working
+# directory; whitespace and colons are stripped from the timestamp.
+# (image_dir could be added to the filename if the prompt above is enabled)
+write.csv(x = ocrText,
+          file = paste0("ocrText-", gsub("\\s+|:", "", Sys.time()), ".csv"),
+          row.names = FALSE,
+          na = "")
--- a/ocrGoogleDrive.R
+++ b/ocrGoogleDrive.R
@ -0,0 +1,123 @@
+# A script to use Google apps to OCR/parse/mangle Collections label-images
+# Note - this may take a few seconds per label-image
+# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
+# https://github.com/fieldmuseum/Collections-OCR
+
+library(googledrive)
+library(tidyr)
+library(readr)
+library(stringr)
+
+
+# get list of local JPG & JPEG image files [REVERT]
+# The pattern is anchored to the file extension (case-insensitive).
+# The earlier unescaped ".jp|.JP" matched "jp" after ANY character, anywhere
+# in a filename, and the matching gsub() then truncated such names.
+imagelist <- list.files(path = "images/", pattern = "\\.jpe?g$",
+                        ignore.case = TRUE)
+# image names without the extension; used below to name the uploaded docs
+imagenames <- tools::file_path_sans_ext(imagelist)
+
+
+# NOTE - update this to the URL of the Google Drive folder to use: images are uploaded into it and the "_text" docs are listed from it
+googleFolder <- "https://drive.google.com/drive/folders/1fOI5JC1naQtfBZ2mXlWFlBOq2bKN17KA"
+# Alternative (disabled) - prompt for the URL: googleFolder <- readline("Paste the URL to a googledrive here: ")
+
+
+# Upload & OCR ####
+
+# Loop through each label-image.
+# seq_along() makes this a no-op when no images were found; 1:NROW() would
+# count down (1, 0) and try to upload paths that do not exist.
+for (i in seq_along(imagelist)) {
+
+  # Upload the image into the Drive folder as a Google Doc named
+  # "<image name>_text" (type = "document" requests the conversion).
+  # overwrite = FALSE: an existing doc of the same name is not replaced.
+  drive_upload(media = paste0("images/", imagelist[i]),
+               path = as_id(googleFolder),
+               name = paste0(imagenames[i], "_text"),
+               type = "document",
+               overwrite = FALSE)
+
+  # show progress
+  print(paste(i, " - ", Sys.time()))
+
+}
+
+
+# List the files sitting directly in the Drive folder (sub-folders are not
+# searched), then keep only those whose name contains "_text"
+filelist <- drive_ls(path = as_id(googleFolder), recursive = FALSE)
+
+textlist <- filelist[grepl("_text", filelist$name), ]
+
+
+# Retrieve OCR text ####
+
+# Pre-allocate the results table: one row per "_text" document.
+# image/text start as empty strings, line_count as NA (integer).
+imagesOCR <- data.frame(image = character(NROW(textlist)),
+                        line_count = rep(NA_integer_, NROW(textlist)),
+                        text = character(NROW(textlist)),
+                        stringsAsFactors = FALSE)
+
+# Make sure the local folder for the downloaded text files is present
+if (dir.exists("ocr_text")) {
+  print("'ocr_text' directory exists")
+} else {
+  dir.create("ocr_text")
+}
+
+# Download the OCR'ed label-images
+# seq_len() skips the loop when no "_text" documents were found.
+for (i in seq_len(NROW(textlist))) {
+
+  # Download this document as plain text.
+  # The target path uses this row's name only; the whole textlist$name
+  # vector was passed before, giving one path per document in the folder.
+  dllist <- drive_download(file = as_id(textlist$id[i]),
+                           path = paste0("ocr_text/", textlist$name[i]),
+                           type = "txt",
+                           overwrite = FALSE)
+
+  # Read the downloaded text into the table
+  imagesOCR$text[i] <- read_file(dllist$local_path)
+
+  # include filename & count of lines in row
+  # The document is matched back to its image by name, because the Drive
+  # listing is not guaranteed to be in the same order as the local files.
+  # Documents with no matching local image keep their own name.
+  source_image <- imagelist[match(sub("_text$", "", textlist$name[i]),
+                                  imagenames)]
+  imagesOCR$image[i] <- if (is.na(source_image)) {
+    textlist$name[i]
+  } else {
+    source_image
+  }
+  # Count newline runs in the text just read ("ocrText" is not defined
+  # until the split step further down the script).
+  imagesOCR$line_count[i] <- str_count(imagesOCR$text[i], "\n+")
+
+  # show progress
+  print(paste(i, " - ", Sys.time()))
+
+}
+
+
+# # loop through each label-image
+# for (i in 1:NROW(imagelist)) {
+# 
+#   # # Setup Google Doc for image
+#   # drive_put(media = "images/PE78981_label.jpg",
+#   #           path = as_id("https://drive.google.com/drive/folders/1fOI5JC1naQtfBZ2mXlWFlBOq2bKN17KA"),
+#   #           name = "test_text", 
+#   #           type = "document")
+#   
+#   # OCR the image to text
+#   ocrText <- image_read(paste0("images/", imagelist[i])) %>%
+#     image_ocr(language = c("eng", "lat", "deu"))
+#   imagesOCR$text[i] <- ocrText
+#   
+#   # include filename & count of lines in row
+#   imagesOCR$image[i] <- imagelist[i]
+#   imagesOCR$line_count[i] <- str_count(ocrText, "\n+")
+#   
+#   # show progress
+#   print(paste(i, " - ", Sys.time()))
+#   
+# }
+
+
+# split text lines to separate columns
+# Always build at least one "Line" column: with no rows (or only NA counts)
+# max() returns -Inf and the column-name sequence could not be created.
+n_line_cols <- max(1L, imagesOCR$line_count, na.rm = TRUE)
+ocrText <- separate(imagesOCR, text,
+                    into = paste0("Line", seq_len(n_line_cols)),
+                    # into = seq(1:20),  # if need consistent NCOL
+                    sep = "(\n)+",
+                    extra = "merge", fill = "right")
+
+
+# Write the split-out table to "ocrText-<timestamp>.csv" in the working
+# directory; whitespace and colons are stripped from the timestamp.
+write.csv(x = ocrText,
+          file = paste0("ocrText-", gsub("\\s+|:", "", Sys.time()), ".csv"),
+          row.names = FALSE,
+          na = "")