# A script to use Google Cloud Vision to OCR/parse/mangle Collections
# label-images
# Note!
#  - this may take a few seconds per label-image
#  - running >1000 API calls/month incurs a fee

# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
# https://github.com/fieldmuseum/Collections-OCR

library(googleCloudVisionR) # NOTE - requires API Key / Service Account
library(tidyr)
library(readr)
library(stringr)
library(magick)


# Helpers ####

# Remove leading and trailing runs of newlines from a single string, so that
# splitting on newline runs yields neither an empty first piece nor a
# dangling "\n" in the last piece.
trim_newlines <- function(text) {
  gsub("^\n+|\n+$", "", text)
}

# Count the newline-delimited segments in a single string. Consecutive
# newlines count as one separator (same "\n+" rule used when splitting the
# text into columns below). Returns 0L for empty / newline-only text.
count_text_lines <- function(text) {
  body <- trim_newlines(text)
  if (!nzchar(body)) {
    return(0L)
  }
  str_count(body, "\n+") + 1L
}


# Find input images ####

# get list of local JPG & JPEG image files [REVERT]
# The pattern is anchored on a literal ".jpg" / ".jpeg" extension
# (case-insensitive), so names that merely contain "jp" are not picked up.
jpeg_ext <- "\\.jpe?g$"
imagelist <- list.files(path = "images/", pattern = jpeg_ext,
                        ignore.case = TRUE)

# Image file names without the extension; used to name per-image output files
imagenames <- sub(jpeg_ext, "", imagelist, ignore.case = TRUE)

# Stop before any API call is attempted if there is nothing to process
if (length(imagelist) == 0) {
  stop("No JPG/JPEG files found in 'images/'", call. = FALSE)
}

# # Prompt user for input/output batch directory names?
# image_dir <- readline("Paste the path for the image directory: ")


# Retrieve OCR text ####

# Setup table for OCRed text: one row per image, filled in by the loop below
imagesOCR <- data.frame(
  image = rep("", length(imagelist)),
  line_count = rep(NA_integer_, length(imagelist)),
  text = rep("", length(imagelist)),
  stringsAsFactors = FALSE
)

# setup output dir
#  # add image_dir if use prompt above
if (!dir.exists("ocr_text")) {  # paste0(image_dir, "_out")
  dir.create("ocr_text")  # paste0(image_dir, "_out")
} else {
  print("output directory exists")
}

# Loop through each label-image
for (i in seq_along(imagelist)) {

  image_path <- file.path("images", imagelist[i])

  # # If files are over 20MB, uncomment this to lower quality + avoid error?
  # ### NOTE! This will overwrite image with lower-quality file.
  #
  # if (file.info(image_path)$size > 20000000) {
  #   image_write(image_read(image_path),
  #               path = image_path,
  #               quality = 80)
  # }

  # OCR image
  # CHECK/FIX THIS FXN ####
  ocr_list <- gcv_get_image_annotations(
    imagePaths = image_path,
    feature = "DOCUMENT_TEXT_DETECTION",
    savePath = file.path("ocr_text", paste0(imagenames[i], "_text.csv"))
  )

  # CHECK/FIX THIS PATH ####
  # NOTE(review): this reads the OCR text from a file named by
  # `ocr_list$local_path`. Whether gcv_get_image_annotations() actually
  # returns a `local_path` field is not visible from this script -- confirm
  # against the package documentation. Fail with a clear message if absent.
  text_path <- ocr_list$local_path
  if (is.null(text_path)) {
    stop("OCR result for '", imagelist[i], "' has no `local_path` field",
         call. = FALSE)
  }

  # Add raw text to dataframe (outer newlines trimmed so the column split
  # below is clean)
  imagesOCR$text[i] <- trim_newlines(read_file(text_path))

  # Add filename & count of lines in row.
  # The count is taken from the OCR text itself, not from the path string.
  imagesOCR$image[i] <- imagelist[i]
  imagesOCR$line_count[i] <- count_text_lines(imagesOCR$text[i])

  # show progress
  print(paste(i, " - ", format(Sys.time())))

  # rate limit to max of 240/min (Vision API limit = 1800/min)
  Sys.sleep(0.25)

}


# Reshape and export ####

# Number of "Line" columns = longest label; always at least one column so the
# split still works when every image produced empty text.
max_lines <- max(c(imagesOCR$line_count, 1L), na.rm = TRUE)

# split text lines to separate columns
ocrText <- separate(imagesOCR, text,
                    into = paste0("Line", seq_len(max_lines)),
                    # into = seq(1:20), # if need consistent NCOL
                    sep = "(\n)+",
                    extra = "merge", fill = "right")

# export CSV
# The timestamp is formatted explicitly (date, then time with no separators)
# rather than relying on implicit as.character(Sys.time()), which may include
# fractional seconds depending on the R version.
write.csv(ocrText,
          paste0("ocrText-",
                 format(Sys.time(), "%Y-%m-%d%H%M%S"),
                 # image_dir,
                 ".csv"),
          na = "",
          row.names = FALSE)