From 8c33418d913531631fc51e4d7cead5991cddeb29 Mon Sep 17 00:00:00 2001
From: tigp <24852530+tigp@users.noreply.github.com>
Date: Sat, 23 Jun 2018 14:59:24 +0200
Subject: [PATCH] WIP - Respond with document info

---
 verify/verify.go | 73 +++++++++++++++++++++++++++++++++++++++++++-----
 1 file changed, 66 insertions(+), 7 deletions(-)

diff --git a/verify/verify.go b/verify/verify.go
index b664ca0..b2c2fc1 100644
--- a/verify/verify.go
+++ b/verify/verify.go
@@ -18,6 +18,9 @@ import (
 	"github.com/digitorus/pkcs7"
 	"github.com/digitorus/timestamp"
 
+	"strconv"
+	"strings"
+
 	"golang.org/x/crypto/ocsp"
 )
 
@@ -312,21 +315,24 @@ func Reader(file io.ReaderAt, size int64) (apiResp *Response, err error) {
 	return
 }
 
+// DocumentInfo holds the document metadata that getDocumentInfo fills in from a pdf.Value; dates, page count and keywords are typed, the rest is text.
 type DocumentInfo struct {
 	Author,
-	CreationDate,
 	Creator,
 	Hash,
-	Keywords,
-	ModDate,
 	Name,
-	Pages,
 	Permission,
 	Producer,
 	Subject,
 	Title string
+
+	Pages    int
+	Keywords []string
+	ModDate,
+	CreationDate time.Time
 }
 
+// getDocumentInfo copies the listed keys of v into the equally named fields of documentInfo, converting dates, page count and keywords on the way.
 func getDocumentInfo(v pdf.Value, documentInfo *DocumentInfo) {
 	keys := []string{"Author", "CreationDate", "Creator", "Hash", "Keywords", "ModDate",
 		"Name", "Pages", "Permission", "Producer", "Subject", "Title"}
@@ -334,14 +340,67 @@ func getDocumentInfo(v pdf.Value, documentInfo *DocumentInfo) {
 	for _, key := range keys {
 		value := v.Key(key)
 		if !value.IsNull() {
+			// text form of the entry; the typed fields below are converted from it
 			valueStr := value.Text()
-			t := reflect.ValueOf(documentInfo).Elem()
-			val := t.FieldByName(key)
-			val.Set(reflect.ValueOf(valueStr))
+
+			// look up the struct field of documentInfo that carries the same name as the key
+			elem := reflect.ValueOf(documentInfo).Elem()
+			field := elem.FieldByName(key)
+
+			switch key {
+			// dates: the parseDate error is discarded and whatever time it returned is stored
+			case "CreationDate", "ModDate":
+				t, _ := parseDate(valueStr)
+				field.Set(reflect.ValueOf(t))
+			// page count: the Atoi error is discarded, so a non-numeric value is stored as 0
+			case "Pages":
+				i, _ := strconv.Atoi(valueStr)
+				documentInfo.Pages = i
+			case "Keywords":
+				documentInfo.Keywords = parseKeywords(valueStr)
+			default:
+				field.Set(reflect.ValueOf(valueStr))
+			}
 		}
 	}
 }
 
+// parseDate parses a PDF date string of the form D:YYYYMMDDHHmmSSOHH'mm'.
+//
+// Everything after the year is optional and the offset marker O is one of
+// +, - or Z. The "D:" prefix and the apostrophes are stripped so that the
+// remainder can be matched against plain Go layouts, longest first. The error
+// of the last attempted layout is returned when none of them match.
+func parseDate(v string) (t time.Time, err error) {
+	s := strings.Replace(strings.TrimPrefix(strings.TrimSpace(v), "D:"), "'", "", -1)
+	// "Z" may be followed by a redundant 00'00' offset; drop it.
+	if i := strings.IndexByte(s, 'Z'); i >= 0 {
+		s = s[:i+1]
+	}
+	layouts := []string{"20060102150405Z0700", "20060102150405Z07", "20060102150405",
+		"200601021504", "2006010215", "20060102", "200601", "2006"}
+	for _, layout := range layouts {
+		if t, err = time.Parse(layout, s); err == nil {
+			return t, nil
+		}
+	}
+	return time.Time{}, err
+}
+
+// parseKeywords splits the Keywords entry of the PDF metadata into keywords.
+//
+// Keywords are separated by commas or semicolons, optionally followed by a
+// space, or else by spaces only; the first separator found in that order wins.
+// https://stackoverflow.com/questions/44608608/the-separator-between-keywords-in-pdf-meta-data
+func parseKeywords(value string) []string {
+	for _, s := range []string{", ", "; ", ",", ";", " "} {
+		if strings.Contains(value, s) {
+			return strings.Split(value, s)
+		}
+	}
+	return []string{value}
+}
+
 func walk(t pdf.Value, pad int) {
 	for _, k := range t.Keys() {
 		v := t.Key(k)