# Collections-OCR/ocrGoogleDrive.R

# A script to use Google apps to OCR/parse/mangle Collections label-images
# Note - this may take a few seconds per label-image
# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)
# https://github.com/fieldmuseum/Collections-OCR

library(googledrive)
library(tidyr)
library(readr)
library(stringr)


# get list of local JPG & JPEG image files [REVERT]
# The dot is escaped and the pattern is anchored to the end of the name, so
# only files with a ".jpg"/".jpeg" extension (any letter case) are listed.
# An unescaped "." matches any character, which also picked up names that
# merely contain "jp" somewhere (e.g. "xjp_notes.txt").
imagelist <- list.files(path = "images/",
                        pattern = "\\.jpe?g$",
                        ignore.case = TRUE)

# Image names with only the trailing extension removed; these are used to
# name the Google Docs created during upload.
imagenames <- sub("\\.jpe?g$", "", imagelist, ignore.case = TRUE)


# NOTE - update path to appropriate google folder
googleFolder <- "https://drive.google.com/drive/folders/1fOI5JC1naQtfBZ2mXlWFlBOq2bKN17KA"
# googleFolder <- readline("Paste the URL to a googledrive here: ")


# Upload & OCR ####

# The target folder is the same for every upload, so convert the URL to a
# Drive id once instead of on every iteration.
uploadFolder <- as_id(googleFolder)

# Loop through each label-image.
# seq_along() yields an empty sequence when no images were found, whereas
# 1:NROW(imagelist) would iterate over c(1, 0) and try to upload "images/NA".
for (i in seq_along(imagelist)) {

  # Upload the image as a Google Doc named "<image name>_text"
  drive_upload(media = paste0("images/", imagelist[i]),
               path = uploadFolder,
               name = paste0(imagenames[i], "_text"),
               type = "document",
               overwrite = FALSE)

  # show progress
  print(paste(i, " - ", Sys.time()))

}


# get list of OCR text files
filelist <- drive_ls(path = as_id(googleFolder),
                     recursive = FALSE)

# Keep only the docs created by the upload step, which are named
# "<image name>_text". Anchoring the match to the end of the name avoids
# picking up unrelated files that merely contain "_text" somewhere.
textlist <- filelist[grepl("_text$", filelist$name), ]


# Retrieve OCR text ####

# Results table with one row per entry in textlist; rows are filled in by the
# download loop. 'image' and 'text' start as empty strings and 'line_count'
# starts as NA integers.
imagesOCR <- data.frame(image = character(NROW(textlist)),
                        line_count = rep(NA_integer_, NROW(textlist)),
                        text = character(NROW(textlist)),
                        stringsAsFactors = FALSE)

# Make sure the local folder for the downloaded text files is present
if (dir.exists("ocr_text")) {
  print("'ocr_text' directory exists")
} else {
  dir.create("ocr_text")
}

# Download the OCR'ed label-images as plain text and record each result.

# Pair every Google Doc with its local image by name, not by loop position:
# textlist comes from drive_ls(), whose row order (and row count) may not
# match imagelist. Docs with no matching local image get NA here.
sourceimages <- imagelist[match(sub("_text$", "", textlist$name), imagenames)]

# seq_len() skips the loop entirely when there is nothing to download
for (i in seq_len(NROW(textlist))) {

  # Export this one Google Doc to a local text file.
  # The path must be built from the i-th name only, not the whole name vector.
  dllist <- drive_download(file = as_id(textlist$id[i]),
                           path = paste0("ocr_text/", textlist$name[i]),
                           type = "txt",
                           overwrite = FALSE)

  # Read the OCR'ed text back in
  imagesOCR$text[i] <- read_file(dllist$local_path)

  # include filename & count of lines in row
  # (fall back to the Google Doc name when no local image matches)
  imagesOCR$image[i] <- if (is.na(sourceimages[i])) {
    textlist$name[i]
  } else {
    sourceimages[i]
  }

  # Count newline runs in the text just read for this row; the previous code
  # read 'ocrText', which is not assigned until after this loop.
  imagesOCR$line_count[i] <- str_count(imagesOCR$text[i], "\n+")

  # show progress
  print(paste(i, " - ", Sys.time()))

}


# # loop through each label-image
# for (i in 1:NROW(imagelist)) {
# 
#   # # Setup Google Doc for image
#   # drive_put(media = "images/PE78981_label.jpg",
#   #           path = as_id("https://drive.google.com/drive/folders/1fOI5JC1naQtfBZ2mXlWFlBOq2bKN17KA"),
#   #           name = "test_text", 
#   #           type = "document")
#   
#   # OCR the image to text
#   ocrText <- image_read(paste0("images/", imagelist[i])) %>%
#     image_ocr(language = c("eng", "lat", "deu"))
#   imagesOCR$text[i] <- ocrText
#   
#   # include filename & count of lines in row
#   imagesOCR$image[i] <- imagelist[i]
#   imagesOCR$line_count[i] <- str_count(ocrText, "\n+")
#   
#   # show progress
#   print(paste(i, " - ", Sys.time()))
#   
# }


# split text lines to separate columns
# Use one "LineN" column per newline run counted in the longest text. The
# count is floored at 1 so that an empty table, or counts that are all NA or
# zero, still produce a valid column specification instead of -Inf or a
# descending sequence.
linecols <- max(c(1L, imagesOCR$line_count), na.rm = TRUE)

ocrText <- separate(imagesOCR, text,
                    into = paste0("Line", seq_len(linecols)),
                    # into = seq(1:20),  # if need consistent NCOL
                    sep = "(\n)+",
                    # extra text beyond the last column is merged into it;
                    # shorter texts are padded with NA on the right
                    extra = "merge", fill = "right")


# export CSV
# The file name carries a timestamp of the form "ocrText-YYYY-MM-DDHHMMSS.csv".
# An explicit format() keeps the name stable; relying on the implicit
# character conversion of Sys.time() can add fractional seconds (and thus an
# extra ".") depending on the R version -- NOTE(review): confirm on target R.
write.csv(ocrText,
          paste0("ocrText-",
                 format(Sys.time(), "%Y-%m-%d%H%M%S"),
                 ".csv"),
          na = "",
          row.names = FALSE)
added scripts for Vision API/OCR 2019-11-13 23:18:33 +00:00			`# A script to use Google apps to OCR/parse/mangle Collections label-images`
			`# Note - this may take a few seconds per label-image`
			`# (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT)`
			`# https://github.com/fieldmuseum/Collections-OCR`

			`library(googledrive)`
			`library(tidyr)`
			`library(readr)`
			`library(stringr)`


			`# get list of local JPG & JPEG image files [REVERT]`
			`imagelist <- list.files(path = "images/", pattern = ".jp\|.JP")`
			`imagenames <- gsub(".jp.*\|.JP.*", "", imagelist)`


			`# NOTE - update path to appropriate google folder`
			`googleFolder <- "https://drive.google.com/drive/folders/1fOI5JC1naQtfBZ2mXlWFlBOq2bKN17KA"`
			`# googleFolder <- readline("Paste the URL to a googledrive here: ")`


			`# Upload & OCR ####`

			`# Loop through each label-image`
			`for (i in 1:NROW(imagelist)) {`

			`# Setup Google Doc for image`
			`drive_upload(media = paste0("images/", imagelist[i]),`
			`path = as_id(googleFolder),`
			`name = paste0(imagenames[i], "_text"),`
			`type = "document",`
			`overwrite = FALSE)`

			`print(paste(i, " - ", Sys.time()))`

			`}`


			`# get list of OCR text files`
			`filelist <- drive_ls(path = as_id(googleFolder),`
			`recursive = FALSE)`

			`textlist <- filelist[grepl("_text", filelist$name)==TRUE,]`


			`# Retrieve OCR text ####`

			`# Setup table for OCRed text`
			`imagesOCR <- data.frame("image" = rep("", NROW(textlist)),`
			`"line_count" = rep("", NROW(textlist)),`
			`"text" = rep("", NROW(textlist)),`
			`stringsAsFactors = F)`

			`imagesOCR$line_count <- as.integer(imagesOCR$line_count)`

			`if (!dir.exists("ocr_text")) {`
			`dir.create("ocr_text")`
			`} else {`
			`print("'ocr_text' directory exists")`
			`}`

			`# Download the OCR'ed label-images`
			`for (i in 1:NROW(textlist)) {`

			`# Setup Google Doc for image`
			`dllist <- drive_download(file = as_id(textlist$id[i]),`
			`path = paste0("ocr_text/", textlist$name),`
			`type = "txt",`
			`overwrite = FALSE)`

			`# OCR the image to text`
			`imagesOCR$text[i] <- read_file(dllist$local_path)`

			`# include filename & count of lines in row`
			`imagesOCR$image[i] <- imagelist[i]`
			`imagesOCR$line_count[i] <- str_count(ocrText, "\n+")`

			`# show progress`
			`print(paste(i, " - ", Sys.time()))`

			`}`


			`# # loop through each label-image`
			`# for (i in 1:NROW(imagelist)) {`
			`#`
			`# # # Setup Google Doc for image`
			`# # drive_put(media = "images/PE78981_label.jpg",`
			`# # path = as_id("https://drive.google.com/drive/folders/1fOI5JC1naQtfBZ2mXlWFlBOq2bKN17KA"),`
			`# # name = "test_text",`
			`# # type = "document")`
			`#`
			`# # OCR the image to text`
			`# ocrText <- image_read(paste0("images/", imagelist[i])) %>%`
			`# image_ocr(language = c("eng", "lat", "deu"))`
			`# imagesOCR$text[i] <- ocrText`
			`#`
			`# # include filename & count of lines in row`
			`# imagesOCR$image[i] <- imagelist[i]`
			`# imagesOCR$line_count[i] <- str_count(ocrText, "\n+")`
			`#`
			`# # show progress`
			`# print(paste(i, " - ", Sys.time()))`
			`#`
			`# }`


			`# split text lines to separate columns`
			`ocrText <- separate(imagesOCR, text,`
			`into = paste0("Line",`
			`seq(1:max(imagesOCR$line_count, na.rm = T))),`
			`# into = seq(1:20), # if need consistent NCOL`
			`sep = "(\n)+",`
			`extra = "merge", fill = "right")`


			`# export CSV`
			`write.csv(ocrText,`
			`paste0("ocrText-",`
			`gsub("\\s+\|:", "", Sys.time()),`
			`".csv"),`
			`na = "",`
			`row.names = F)`