whimsical-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject [whimsy] branch master updated: Show metadata separately
Date Fri, 29 Nov 2019 08:59:36 GMT
This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new 431d2ef  Show metadata separately
431d2ef is described below

commit 431d2ef7a86b1a73adab16f2f446dbc1a0279daf
Author: Sebb <sebb@apache.org>
AuthorDate: Fri Nov 29 08:59:15 2019 +0000

    Show metadata separately
---
 www/secretary/icla-parse.cgi | 15 +++++++++++++--
 www/secretary/iclaparser.rb  | 17 +++++++++++------
 2 files changed, 24 insertions(+), 8 deletions(-)

diff --git a/www/secretary/icla-parse.cgi b/www/secretary/icla-parse.cgi
index 418eecc..97d9e4d 100755
--- a/www/secretary/icla-parse.cgi
+++ b/www/secretary/icla-parse.cgi
@@ -29,8 +29,19 @@ begin
   
   path = message.find(iclaname).as_file.path
   
-  ICLAParser.parse(path).sort_by{|k,v| k.to_s }.each do |k,v|
-    puts "%-20s %s" % [k,v]
+  parsed = ICLAParser.parse(path)
+  parsed.sort_by{|k,v| k.to_s }.each do |k,v|
+    puts "%-20s %s" % [k,v] unless k == :_meta
+  end
+  puts ""
+  parsed[:_meta].sort_by{|k,v| k.to_s }.each do |k,v|
+    if k == 'info'
+      v.sort_by{|k,v| k.to_s }.each do |k,v|
+        puts "%-20s %s %s" % ['info',k,v]
+      end
+    else
+      puts "%-20s %s" % [k,v]
+    end
   end
   
 rescue Exception => e
diff --git a/www/secretary/iclaparser.rb b/www/secretary/iclaparser.rb
index 3984cbd..fb4ea8c 100644
--- a/www/secretary/iclaparser.rb
+++ b/www/secretary/iclaparser.rb
@@ -136,11 +136,16 @@ module ICLAParser
   # parse the PDF
   def self.parse(path)
     data=Hash.new
-    data[:dataSource] = {} # have we found anything
+    metadata = {}
+    data[:_meta] = metadata
+    metadata[:dataSource] = {} # have we found anything
     freetext = {} # gather the free text details
     debug={}
     begin
       reader = PDF::Reader.new(path)
+      %w(pdf_version info metadata page_count).each do |i|
+        metadata[i] = reader.public_send(i)
+      end
       reader.objects.each do |k,v|
         type = v[:Type] rescue nil
         subtype = v[:Subtype] rescue nil
@@ -158,7 +163,7 @@ module ICLAParser
                 # Entries may be duplicated, so use a hash to store them
                 id = rect.inspect+contents # if the rect and contents match, then they overwrite each other
                 freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]} 
-                data[:dataSource]['FreeText'] = true
+                metadata[:dataSource]['FreeText'] = true
               end
             else
               puts "warn: #{contents} Rect is #{rect.class} in #{path}"
@@ -183,7 +188,7 @@ module ICLAParser
                 if val.length > 0
                   data[canon_field_name(key)] = val.gsub("\x7F",'') # Not sure where these originate
                 end
-                data[:dataSource]['Form'] = true
+                metadata[:dataSource]['Form'] = true
               end
             end
           end
@@ -204,7 +209,7 @@ module ICLAParser
               map{|v| v[:Contents]}.join(", ")
         end
       end
-      if data[:dataSource].size == 0 or ((data[:text].size rescue 0) <= 1 and data.size < 3) # No annotations found or not useful
+      if metadata[:dataSource].size == 0 or ((data[:text].size rescue 0) <= 1 and data.size < 3) # No annotations found or not useful
         page1 = nil # cache for page 1
         fontdict = Hash.new
         # Try looking for text sections instead
@@ -223,12 +228,12 @@ module ICLAParser
         lines = receiver.get_lines() # do we still need these?
         debug[:lines] = lines
         if text.length > 3
-          data[:dataSource]['Text'] = true
+          metadata[:dataSource]['Text'] = true
           data[:text] = text
         else
           page1.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
             if i == 1 # starts with Full name
-              data[:dataSource]['Page'] = true
+              metadata[:dataSource]['Page'] = true
               # drop the postamble
               form = lump.slice_before(/^\S/).first
               # split into headers


Mime
View raw message