From 8c33418d913531631fc51e4d7cead5991cddeb29 Mon Sep 17 00:00:00 2001 From: tigp <24852530+tigp@users.noreply.github.com> Date: Sat, 23 Jun 2018 14:59:24 +0200 Subject: [PATCH] WIP - Respond with document info --- verify/verify.go | 73 +++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 66 insertions(+), 7 deletions(-) diff --git a/verify/verify.go b/verify/verify.go index b664ca0..b2c2fc1 100644 --- a/verify/verify.go +++ b/verify/verify.go @@ -18,6 +18,9 @@ import ( "github.com/digitorus/pkcs7" "github.com/digitorus/timestamp" + "strconv" + "strings" + "golang.org/x/crypto/ocsp" ) @@ -312,21 +315,24 @@ func Reader(file io.ReaderAt, size int64) (apiResp *Response, err error) { return } +// DocumentInfo contains document information type DocumentInfo struct { Author, - CreationDate, Creator, Hash, - Keywords, - ModDate, Name, - Pages, Permission, Producer, Subject, Title string + + Pages int + Keywords []string + ModDate, + CreationDate time.Time } +// getDocumentInfo parses document information func getDocumentInfo(v pdf.Value, documentInfo *DocumentInfo) { keys := []string{"Author", "CreationDate", "Creator", "Hash", "Keywords", "ModDate", "Name", "Pages", "Permission", "Producer", "Subject", "Title"} @@ -334,14 +340,67 @@ func getDocumentInfo(v pdf.Value, documentInfo *DocumentInfo) { for _, key := range keys { value := v.Key(key) if !value.IsNull() { + // get string value valueStr := value.Text() - t := reflect.ValueOf(documentInfo).Elem() - val := t.FieldByName(key) - val.Set(reflect.ValueOf(valueStr)) + + // get struct field + elem := reflect.ValueOf(documentInfo).Elem() + field := elem.FieldByName(key) + + switch key { + // parse dates + case "CreationDate", "ModDate": + t, _ := parseDate(valueStr) + field.Set(reflect.ValueOf(t)) + // parse pages + case "Pages": + i, _ := strconv.Atoi(valueStr) + documentInfo.Pages = i + case "Keywords": + documentInfo.Keywords = parseKeywords(valueStr) + default: + field.Set(reflect.ValueOf(valueStr)) + } } } 
// parseDate parses a PDF date string of the form
//
//	D:YYYYMMDDHHmmSSOHH'mm'
//
// where O is '+', '-' or 'Z' and HH'mm' is the offset from Universal Time.
// Only the year is mandatory: every field after it may be omitted, in which
// case time.Parse supplies its defaults (January, day 1, midnight). The "D:"
// prefix and the apostrophes around the offset fields are optional, so both
// the "+02'00'" and "+02'00" spellings are accepted. A value without an
// offset, or with a 'Z' offset, is interpreted as UTC.
//
// On failure it returns the zero time.Time and the error from time.Parse.
func parseDate(v string) (time.Time, error) {
	// YYYYMMDDHHmmSS expressed as a Go reference layout; prefixes of even
	// length (from 4 up) are the valid truncated forms.
	const full = "20060102150405"

	s := strings.TrimPrefix(strings.TrimSpace(v), "D:")
	// Apostrophes only delimit the offset fields; dropping them turns
	// "+02'00'" into "+0200", which Go's "-0700" layout understands.
	s = strings.Replace(s, "'", "", -1)

	// The timestamp is digits only, so the first '+', '-' or 'Z' starts the
	// offset part.
	stamp, zone := s, ""
	if i := strings.IndexAny(s, "+-Z"); i >= 0 {
		stamp, zone = s[:i], s[i:]
	}

	// Any other length keeps the full layout so that time.Parse reports the
	// malformed value instead of this function guessing at it.
	layout := full
	switch len(stamp) {
	case 4, 6, 8, 10, 12:
		layout = full[:len(stamp)]
	}

	switch {
	case zone == "", zone == "Z", zone == "Z00", zone == "Z0000":
		// UTC. Some producers write a redundant 00'00' after the Z.
	case len(zone) == 3:
		// Hour-only offset, e.g. "+02".
		layout += "-07"
		stamp += zone
	default:
		// Hour and minute offset, e.g. "+0530"; anything malformed is
		// rejected by time.Parse.
		layout += "-0700"
		stamp += zone
	}

	return time.Parse(layout, stamp)
}

// parseKeywords splits the Keywords entry of the PDF document information
// into individual keywords.
//
// If the value contains a comma or a semicolon, those characters (in any mix)
// are the separators and a keyword may itself contain spaces. Otherwise the
// value is split on whitespace. Surrounding whitespace is trimmed from every
// keyword and empty entries are dropped, so a blank value yields a nil slice.
// See https://stackoverflow.com/questions/44608608/the-separator-between-keywords-in-pdf-meta-data
func parseKeywords(value string) []string {
	split := strings.Fields
	if strings.ContainsAny(value, ",;") {
		split = func(s string) []string {
			return strings.FieldsFunc(s, func(r rune) bool {
				return r == ',' || r == ';'
			})
		}
	}

	var keywords []string
	for _, k := range split(value) {
		if k = strings.TrimSpace(k); k != "" {
			keywords = append(keywords, k)
		}
	}
	return keywords
}