diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..6c0da7c --- /dev/null +++ b/.gitignore @@ -0,0 +1,7 @@ +.Rproj.user +.Rhistory +.RData +.Ruserdata + +*.jpg +*.csv \ No newline at end of file diff --git a/Collections-OCR.Rproj b/Collections-OCR.Rproj new file mode 100644 index 0000000..8e3c2eb --- /dev/null +++ b/Collections-OCR.Rproj @@ -0,0 +1,13 @@ +Version: 1.0 + +RestoreWorkspace: Default +SaveWorkspace: Default +AlwaysSaveHistory: Default + +EnableCodeIndexing: Yes +UseSpacesForTab: Yes +NumSpacesForTab: 2 +Encoding: UTF-8 + +RnwWeave: Sweave +LaTeX: pdfLaTeX diff --git a/ocrMangle.R b/ocrMangle.R index 07097db..c62e106 100644 --- a/ocrMangle.R +++ b/ocrMangle.R @@ -3,7 +3,7 @@ # (c) 2019 The Field Museum - MIT License (https://opensource.org/licenses/MIT) # https://github.com/fieldmuseum/Collections-OCR - +library(tidyr) library(magick) library(stringr) library(tesseract) @@ -53,6 +53,7 @@ ocrText <- separate(imagesOCR, text, sep = "\n", extra = "merge", fill = "right") + # export CSV write.csv(ocrText, paste0("ocrText-",