whimsical-commits mailing list archives

Site index · List index
Message view « Date » · « Thread »
Top « Date » · « Thread »
From s...@apache.org
Subject [whimsy] branch master updated: Draft code to parse ICLA PDFs
Date Tue, 26 Nov 2019 15:10:31 GMT
This is an automated email from the ASF dual-hosted git repository.

sebb pushed a commit to branch master
in repository https://gitbox.apache.org/repos/asf/whimsy.git


The following commit(s) were added to refs/heads/master by this push:
     new dc9d96b  Draft code to parse ICLA PDFs
dc9d96b is described below

commit dc9d96bb121b53e06476943701735df95a522d63
Author: Sebb <sebb@apache.org>
AuthorDate: Tue Nov 26 15:10:27 2019 +0000

    Draft code to parse ICLA PDFs
    
    Needs to be tidied up and wired into workbench somehow
---
 www/secretary/icla-parse.cgi |  38 +++++++
 www/secretary/iclaparser.rb  | 257 +++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 295 insertions(+)

diff --git a/www/secretary/icla-parse.cgi b/www/secretary/icla-parse.cgi
new file mode 100755
index 0000000..1f139db
--- /dev/null
+++ b/www/secretary/icla-parse.cgi
@@ -0,0 +1,38 @@
+#!/usr/bin/env ruby
+
+$LOAD_PATH.unshift '/srv/whimsy/lib'
+
+#        DRAFT
+#        DRAFT
+#        DRAFT
+#        DRAFT
+#        DRAFT
+#        DRAFT
+
+# Test ICLA PDF parsing
+
+# Invoke as:
+# /secretary/icla-parse/yyyymm/hash/icla.pdf
+
+print "Content-type: text/plain; charset=UTF-8\r\n\r\n"
+
+pathinfo = ENV['PATH_INFO']
+iclaname = File.basename(pathinfo)
+puts "Processing #{pathinfo} to parse #{iclaname}"
+puts ""
+
+begin
+  require_relative 'workbench/models/mailbox'
+  require_relative 'iclaparser'
+  
+  message = Mailbox.find(pathinfo)
+  
+  path = message.find(iclaname).as_file.path
+  
+  ICLAParser.parse(path).sort_by{|k,v| k}.each do |k,v|
+    puts "%-20s %s" % [k,v]
+  end
+  
+rescue Exception => e
+  p e
+end
diff --git a/www/secretary/iclaparser.rb b/www/secretary/iclaparser.rb
new file mode 100644
index 0000000..3984cbd
--- /dev/null
+++ b/www/secretary/iclaparser.rb
@@ -0,0 +1,257 @@
+#!/usr/bin/env ruby
+
+#         DRAFT DRAFT DRAFT
+#         DRAFT DRAFT DRAFT
+#         DRAFT DRAFT DRAFT
+#         DRAFT DRAFT DRAFT
+#         DRAFT DRAFT DRAFT
+
+#
+# ICLA PDF parsing support
+#
+# Try to extract user text from ICLA PDFs.
+
+# The Gem is not 100% accurate in creating a text version of the page.
+# Also it's tricky to extract the text accurately.
+
+# So we try other methods first:
+# - if there is a form, return its fields
+# - if there are FreeText Annotations, return them in page order
+# - use show_text_with_positioning as that seems to be used for PDF updates
+# - where the PDF only uses show_text, the Gem is better at combining the data, so use page.txt
+
+require 'pdf-reader'
+
+# TODO perhaps always extract all the data types then choose the best
+# Should turn hash values into arrays?
+module ICLAParser
+  # Process page to extract text with positioning elements
+  # These are often used instead of providing form fields
+  class Receiver
+    SKIP = [
+      # Short elements that are not user data
+      'Individual Contributor',
+      'License Agreement',
+      '("Agreement") V2.0',
+      "as \"Not a Contribution.\"",
+      "inaccurate in any respect.",
+      "for your records.",
+      "1. Definitions.",
+      "Contributions and such derivative works.",
+      "litigation is filed.",
+      "Contributions."
+    ]
+
+    def initialize(fontdict)
+      @texts = [] # show_text_with_positioning
+      @lines = [] # show_text
+      @tfs = nil # text font and size
+      @fontdict = fontdict
+    end
+
+    # Some PDFs use show_text() multiple times in a line
+    def begin_text_object
+      @textobj = []
+    end
+
+    def end_text_object
+      @lines << @textobj.join('')
+    end
+
+    def set_text_font_and_size(*args)
+       @tfs=args
+    end
+  
+    def show_text(string)
+      font = @fontdict[@tfs.first]
+      utf8 = ICLAParser.string_to_utf8(string, font)
+      @textobj << utf8
+    end
+
+    def show_text_with_positioning(*args)
+        font = @fontdict[@tfs.first]
+        # args are Strings (in the current font encoding) interspersed with integer spacing adjustments; only want the strings
+        # We assume the positioning does not overlay characters so can be ignored
+        chars = []
+        args.flatten.each do |arg|
+          if arg.is_a?(String)
+            char = ICLAParser.string_to_utf8(arg, font)
+            chars << char
+          end
+        end
+        val = chars.join("").strip
+        len = val.length
+        # some PDFs have the individual text in this format so skip long lines which are unlikely to be user data
+        # Could perhaps have full list of expected text lines instead.
+        unless len == 0 or len > 50 or SKIP.include? val
+          @texts << val
+        end
+    end
+
+    def get_text
+      @texts
+    end
+    
+    def get_lines
+      @lines
+    end
+    
+  end
+
+  def self.string_to_utf8(string, font)
+    chars = []
+    glyphs = font.unpack(string)
+    glyphs.each do |glyph_code|
+      char = font.to_utf8(glyph_code)
+      # One pdf (yev) has spurious \t\r<sp>?<nbsp> translated from 36 => [9, 13, 32, 194, 160]
+      if glyph_code == 36 and char =~ /^\t\r /
+        char = ' '
+      end
+      chars << char
+    end
+    chars.join('')
+  end
+
+  # Standard form field names for other code to use
+  NAME2FIELD = {
+    'fullname' => :FullName,
+    'publicname' => :PublicName,
+    'mailingaddress' => :MailingAddress,
+    'mailingaddress2' => :MailingAddress2,
+    'postaladdress' => :MailingAddress,
+    'country' => :Country,
+    'telephone' => :Telephone,
+    'e-mail' => :EMail,
+    'preferredapacheid(s)' => :ApacheID,
+    'notifyproject' => :Project,
+    'date' => :Date,
+    'signature' => :Signature,
+  }
+
+  # canonicalise the names found in the PDF
+  def self.canon_field_name(pdfname)
+    NAME2FIELD[pdfname.gsub(' ','').downcase] || pdfname
+  end
+
+  # parse the PDF
+  def self.parse(path)
+    data=Hash.new
+    data[:dataSource] = {} # have we found anything
+    freetext = {} # gather the free text details
+    debug={}
+    begin
+      reader = PDF::Reader.new(path)
+      reader.objects.each do |k,v|
+        type = v[:Type] rescue nil
+        subtype = v[:Subtype] rescue nil
+        
+        if type == :Annot
+          if subtype == :FreeText # These are not directly associated with forms
+            rect = v[:Rect]
+            # rect can be a reference. If so, it seems there may be multiple copies with different IDs but same Rect coords and contents
+            if rect.is_a?(PDF::Reader::Reference)
+              rect = reader.objects.deref(rect)
+            end
+            if rect.is_a?(Array)
+              contents = v[:Contents]
+              if contents and contents.length > 0 and contents != "\x14" # ignore "\x14" == ASCII DC4
+                # Entries may be duplicated, so use a hash to store them
+                id = rect.inspect+contents # if the rect and contents match, then they overwrite each other
+                freetext[id] = {Contents: contents.strip, x: rect[0], y: rect[1]} 
+                data[:dataSource]['FreeText'] = true
+              end
+            else
+              puts "warn: #{contents} Rect is #{rect.class} in #{path}"
+            end
+          else
+            key = v[:T]
+            if key
+              val = v[:V]
+              # This is a hack; should really find the font def and use that
+              if val
+                debug["#{key}"] = v.inspect
+                if val.bytes[0..1] == [254,255]
+                  val = val.encode('utf-8','utf-16').strip
+                else
+                  begin
+                    val = val.encode('utf-8').strip
+                  rescue Encoding::UndefinedConversionError
+                    val = val.encode('utf-8','iso-8859-1').strip
+                  end
+                end
+                val.gsub!("\x7F",'') # Not sure where these originate
+                if val.length > 0
+                  data[canon_field_name(key)] = val.gsub("\x7F",'') # Not sure where these originate
+                end
+                data[:dataSource]['Form'] = true
+              end
+            end
+          end
+        else
+        end
+      end # objects
+      if freetext.size > 0
+        data[:text] = []
+        # Sort by Y descending (down the page) and X ascending (across)
+        # split into separate chunks if the difference in Y is more than a few points
+        how_close = 3
+        freetext.values. # no need for ids any more
+          sort_by{|e| -e[:y] }. # sort by Y desc
+          slice_when{|i,j| (i[:y]-j[:y]) > how_close}. # gather nearby Y values in case there are multiple entries on a line
+          each do |k|
+            data[:text] << k.
+              sort_by{|l| l[:x]}. # sort by X ascending
+              map{|v| v[:Contents]}.join(", ")
+        end
+      end
+      if data[:dataSource].size == 0 or ((data[:text].size rescue 0) <= 1 and data.size < 3) # No annotations found or not useful
+        page1 = nil # cache for page 1
+        fontdict = Hash.new
+        # Try looking for text sections instead
+        receiver = Receiver.new(fontdict)
+        reader.pages.each do |page|
+          # extract the fonts (needed for conversion to utf-8)
+          page.fonts.each do |label, font|
+            fontdict[label] ||= PDF::Reader::Font.new(page.objects, page.objects.deref(font))
+          end
+          page.walk(receiver)
+          page1 ||= page.text
+        end
+        # pickup up the collected strings
+        text = receiver.get_text()
+#        p text
+        lines = receiver.get_lines() # do we still need these?
+        debug[:lines] = lines
+        if text.length > 3
+          data[:dataSource]['Text'] = true
+          data[:text] = text
+        else
+          page1.each_line.slice_before(/^\s+Full name:/).each_with_index do |lump, i|
+            if i == 1 # starts with Full name
+              data[:dataSource]['Page'] = true
+              # drop the postamble
+              form = lump.slice_before(/^\S/).first
+              # split into headers
+              form.slice_before(/^\s+.+:/).each do |lines|
+                # trim leading and trailing blanks and underscores and drop blank lines
+                line = lines.map{|l| l.sub(/^[ _]+/,'').sub(/[ _]+$/,'')}.select{|l| l.length > 0}.join(',')
+                case line
+                  when /^\s*(?:\(optional\) )?(.+):\s+(.*)/
+                    data[canon_field_name($1)] = $2
+                  else
+                    data[:unmatched] ||= []
+                    data[:unmatched] << line
+                end
+              end
+            end
+          end 
+        end
+      end
+    rescue Exception => e
+      data[:error]="Error processing #{path} => #{e.inspect} #{caller}"
+    end
+#    data[:debug] = debug
+    # TODO attempt to classify data[:text] items?
+    data
+  end
+end


Mime
View raw message