From 6274b48693044b3cfb0bbc5670d6d866e604bf11 Mon Sep 17 00:00:00 2001
From: magpiedin <magpiedin@gmail.com>
Date: Fri, 29 Nov 2019 17:10:51 -0600
Subject: [PATCH] updated readme & libraries

---
 README.md        | 51 +++++++++++++++++++++++++++++++++++++++++++-----
 ocrCloudVision.R |  2 +-
 2 files changed, 47 insertions(+), 6 deletions(-)

diff --git a/README.md b/README.md
index b688bc0..122b2f4 100644
--- a/README.md
+++ b/README.md
@@ -1,13 +1,51 @@
 # Collections-OCR
-A script to batch collections label-images through OCR
+A few scripts that run batches of collections label-images through OCR
 
-## Dependencies 
+## Google Cloud Vision API & `ocrCloudVision.R`
+### Dependencies 
+Make sure to install these libraries first:
+- `googleCloudVisionR` - to do OCR magic 
+- Other dependencies include `readr`, `tidyr`, `stringr` for data handling
+
+### How to run `ocrCloudVision.R`:
+Notes:
+- This currently uses Google's Cloud Vision API, which requires:
+  - Being aware of [pricing & quotas for the Google Vision API](https://cloud.google.com/vision/pricing)
+  - Setting up a project on Google Cloud Platform
+  - Authenticating your machine by setting up a Service account & key 
+    - Get help from the [cloudyr repo for `googleCloudVisionR`](https://cloudyr.github.io/googleCloudVisionR/)
+- This can take over 30 seconds per label-image.
+  - Be mindful how many images you add to your "images" directory.
+  - Be mindful of your internet connection speed
+  - Keep image sizes under 20MB
+    (Overall, smaller image files transfer and process more quickly)
+- Output likely needs some [or many] follow-up/clean-up steps.
+  - Batch similar images together to streamline follow-up steps.
+
+To run the script:
+1. Add a folder named "images" to this script's directory
+2. Add the images (JPG & JPEG) you'd like to OCR to that directory
+3. Run the script (`Rscript ocrCloudVision.R`)
+
+### Output from `ocrCloudVision.R`:
+A CSV named "ocrText-[Date-time].csv", containing these columns:
+- **"image"** = filename for each JPG and JPEG
+- **"imagesize"** = filesize for each image (in MB)
+- **"ocr_start"** = start-date and time when an image was submitted to the Google Vision API
+- **"ocr_duration"** = duration (in seconds) of the OCR process
+- **"line_count"** = number of lines in each OCR transcription
+- **"Line1" - "Line[N]"** = text for each line in the OCR transcription of an image.
+  - the number of **"Line"** columns will match the maximum number of lines as needed.
+
+
+## Tesseract & `ocrMangle.R`
+### Dependencies 
 Make sure to install these libraries first:
 - `magick` - to read in image files
 - `tesseract` - to do OCR magic 
 - `stringr` - to split the OCR'ed lines to columns
 
-## How to run the script:
+### How to run `ocrMangle.R`:
 Notes:
 - This can takes over 10 seconds per label-image.
   - Be mindful how many images you add to your "images" directory.
@@ -20,10 +58,13 @@ To run the script:
 2. Add the images (JPG & JPEG) you'd like to OCR to that directory
 3. Run the script (`Rscript ocrMangle.R`)
 
-## Output
+### Output from `ocrMangle.R`:
 A CSV named "ocrText-[Date-time].csv", containing these columns:
 - **"image"** = filename for each JPG and JPEG
 - **"line_count"** = number of lines in each OCR transcription
-- **"Line1"** - "Line[N]" = text for each line in the OCR transcription.
+- **"Line1" - "Line[N]"** = text for each line in the OCR transcription.
   - the number of **"Line"** columns will match the maximum number of lines as needed.
 
+
+## Google Drive API & `ocrGoogleDrive.R`
+### This is drafty; might work for small batches, but needs work.
diff --git a/ocrCloudVision.R b/ocrCloudVision.R
index 4bc70ac..231a899 100644
--- a/ocrCloudVision.R
+++ b/ocrCloudVision.R
@@ -9,7 +9,7 @@ library(googleCloudVisionR)  # NOTE - requires API Key / Service Account
 library(tidyr)
 library(readr)
 library(stringr)
-library(magick)
+# library(magick)
 
 # get list of local JPG & JPEG image files [REVERT]
 imagelist <- list.files(path = "images/", pattern = ".jp|.JP")