[Groonga-commit] ranguba/chupa-text at 7b5acfe [master] Add support docx

Back to archive index
Kouhei Sutou null+****@clear*****
Mon Feb 25 15:37:09 JST 2019


Kouhei Sutou	2019-02-25 15:37:09 +0900 (Mon, 25 Feb 2019)

  Revision: 7b5acfe9dbb7b8a0be3e61d5dcc61d62caa0a14d
  https://github.com/ranguba/chupa-text/commit/7b5acfe9dbb7b8a0be3e61d5dcc61d62caa0a14d

  Message:
    Add support docx

  Added files:
    lib/chupa-text/decomposers/office-open-xml-document.rb
    lib/chupa-text/decomposers/office-open-xml.rb
    test/decomposers/test-office-open-xml-document.rb
    test/fixture/docx/attributes.docx
    test/fixture/docx/multi-pages.docx
    test/fixture/docx/one-page.docx

  Added: lib/chupa-text/decomposers/office-open-xml-document.rb (+33 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/office-open-xml-document.rb    2019-02-25 15:37:09 +0900 (31266ba)
@@ -0,0 +1,33 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "chupa-text/decomposers/office-open-xml"
+
+module ChupaText
+  module Decomposers
+    class OfficeOpenXMLDocument < OfficeOpenXML
+      registry.register("office-open-xml-document", self)
+
+      def initialize(options={})
+        super
+        @extension = "docx"
+        @mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+        @path = "word/document.xml"
+        @namespace_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main"
+      end
+    end
+  end
+end

  Added: lib/chupa-text/decomposers/office-open-xml.rb (+176 -0) 100644
===================================================================
--- /dev/null
+++ lib/chupa-text/decomposers/office-open-xml.rb    2019-02-25 15:37:09 +0900 (3bcfb21)
@@ -0,0 +1,176 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+require "rexml/parsers/sax2parser"
+require "rexml/sax2listener"
+
+require "archive/zip"
+
+module ChupaText
+  module Decomposers
+    class OfficeOpenXML < Decomposer
+      def target?(data)
+        data.extension == @extension or data.mime_type == @mime_type
+      end
+
+      def target_score(data)
+        if target?(data)
+          -1
+        else
+          nil
+        end
+      end
+
+      def decompose(data)
+        text = nil
+        attributes = {}
+        data.open do |input|
+          Archive::Zip.open(input) do |zip|
+            zip.each do |entry|
+              next unless entry.file?
+              case entry.zip_path
+              when @path
+                text = ""
+                listener = TextListener.new(text, @namespace_uri)
+                parse(entry.file_data, listener)
+              when "docProps/app.xml"
+                listener = AttributesListener.new(attributes)
+                parse(entry.file_data, listener)
+              when "docProps/core.xml"
+                listener = AttributesListener.new(attributes)
+                parse(entry.file_data, listener)
+              end
+            end
+          end
+        end
+        text_data = TextData.new(text, source_data: data)
+        attributes.each do |name, value|
+          text_data[name] = value
+        end
+        yield(text_data)
+      end
+
+      private
+      def parse(io, listener)
+        source = REXML::IOSource.new(io)
+        parser = REXML::Parsers::SAX2Parser.new(source)
+        parser.listen(listener)
+        parser.parse
+      end
+
+      class TextListener
+        include REXML::SAX2Listener
+
+        def initialize(output, target_uri)
+          @output = output
+          @target_uri = target_uri
+          @in_target = false
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          return unless uri == @target_uri
+          case local_name
+          when "t"
+            @in_target = true
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          return unless uri == @target_uri
+          case local_name
+          when "t"
+            @in_target = false
+          when "p", "br"
+            @output << "\n"
+          end
+        end
+
+        def characters(text)
+          @output << text if @in_target
+        end
+
+        def cdata(content)
+          @output << content if @in_target
+        end
+      end
+
+      class AttributesListener
+        include REXML::SAX2Listener
+
+        CORE_PROPERTIES_URI =
+          "http://schemas.openxmlformats.org/package/2006/metadata/core-properties"
+        EXTENDED_PROPERTIES_URI =
+          "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties"
+        DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/"
+        DUBLIN_CORE_TERMS_URI = "http://purl.org/dc/terms/"
+
+        def initialize(attributes)
+          @attributes = attributes
+          @name = nil
+          @type = nil
+        end
+
+        def start_element(uri, local_name, qname, attributes)
+          case uri
+          when CORE_PROPERTIES_URI
+            case local_name
+            when "keywords"
+              @name = local_name
+            end
+          when EXTENDED_PROPERTIES_URI
+            case local_name
+            when "Application"
+              @name = local_name.downcase
+            end
+          when DUBLIN_CORE_URI
+            case local_name
+            when "description", "title", "subject"
+              @name = local_name
+            end
+          when DUBLIN_CORE_TERMS_URI
+            case local_name
+            when "created", "modified"
+              @name = "#{local_name}_time"
+              @type = :w3cdtf
+            end
+          end
+        end
+
+        def end_element(uri, local_name, qname)
+          @name = nil
+          @type = nil
+        end
+
+        def characters(text)
+          set_attribute(text)
+        end
+
+        def cdata(content)
+          set_attribute(content)
+        end
+
+        def set_attribute(value)
+          return if @name.nil?
+          case @type
+          when :w3cdtf
+            value = Time.xmlschema(value)
+          end
+          @attributes[@name] = value
+        end
+      end
+    end
+  end
+end

  Added: test/decomposers/test-office-open-xml-document.rb (+129 -0) 100644
===================================================================
--- /dev/null
+++ test/decomposers/test-office-open-xml-document.rb    2019-02-25 15:37:09 +0900 (61fa8ea)
@@ -0,0 +1,129 @@
+# Copyright (C) 2019  Kouhei Sutou <kou****@clear*****>
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
+
+class TestDecomposersOfficeOpenXMLDocument < Test::Unit::TestCase
+  include Helper
+
+  def setup
+    @decomposer = ChupaText::Decomposers::OfficeOpenXMLDocument.new({})
+  end
+
+  def decompose(path)
+    data = ChupaText::InputData.new(path)
+    decomposed = []
+    @decomposer.decompose(data) do |decomposed_data|
+      decomposed << decomposed_data
+    end
+    decomposed
+  end
+
+  sub_test_case("#target_score") do
+    def test_extension
+      data = ChupaText::Data.new
+      data.body = ""
+      data.uri = "document.docx"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+
+    def test_mime_type
+      data = ChupaText::Data.new
+      data.mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
+      assert_equal(-1, @decomposer.target_score(data))
+    end
+  end
+
+  sub_test_case("#decompose") do
+    sub_test_case("attributes") do
+      def decompose(attribute_name)
+        super(fixture_path("docx", "attributes.docx")).collect do |data|
+          data[attribute_name]
+        end
+      end
+
+      def test_title
+        assert_equal(["Title"], decompose("title"))
+      end
+
+      def test_author
+        assert_equal([nil], decompose("author"))
+      end
+
+      def test_subject
+        assert_equal(["Subject"], decompose("subject"))
+      end
+
+      def test_keywords
+        assert_equal(["Keyword1 Keyword2"], decompose("keywords"))
+      end
+
+      def test_created_time
+        assert_equal([Time],
+                     decompose("created_time").collect(&:class))
+      end
+
+      def test_modified_time
+        assert_equal([Time],
+                     decompose("modified_time").collect(&:class))
+      end
+
+      def test_application
+        assert_equal(["LibreOffice"],
+                     normalize_applications(decompose("application")))
+      end
+
+      def normalize_applications(applications)
+        applications.collect do |application|
+          normalize_application(application)
+        end
+      end
+
+      def normalize_application(application)
+        if application.start_with?("LibreOffice")
+          "LibreOffice"
+        else
+          application
+        end
+      end
+
+      def test_creation_date
+        assert_equal([nil], decompose("creation_date"))
+      end
+    end
+
+    sub_test_case("one page") do
+      def decompose
+        super(fixture_path("docx", "one-page.docx"))
+      end
+
+      def test_body
+        assert_equal(["Page1\n"], decompose.collect(&:body))
+      end
+    end
+
+    sub_test_case("multi pages") do
+      def decompose
+        super(fixture_path("docx", "multi-pages.docx"))
+      end
+
+      def test_body
+        assert_equal([<<-BODY], decompose.collect(&:body))
+Page1
+Page2
+        BODY
+      end
+    end
+  end
+end

  Added: test/fixture/docx/attributes.docx (+23 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/docx/attributes.docx    2019-02-25 15:37:09 +0900 (601c30d)
@@ -0,0 +1,23 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document
+uri: file:/tmp/FNZQyH_attributes.docx
+path: /tmp/FNZQyH_attributes.docx
+size: 3937
+Content-Type: multipart/mixed; boundary=a7e0c2515b16236373657441b64cd1810c423456
+
+--a7e0c2515b16236373657441b64cd1810c423456
+mime-type: text/plain
+uri: file:/tmp/FNZQyH_attributes.txt
+path: /tmp/FNZQyH_attributes.txt
+size: 6
+title: Title
+created_time: 2019-02-25 06:37:26 UTC
+source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+subject: Subject
+keywords: Keyword1, Keyword2
+creator: Writer
+producer: LibreOffice 5.2
+
+Page1
+
+--a7e0c2515b16236373657441b64cd1810c423456--

  Added: test/fixture/docx/multi-pages.docx (+21 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/docx/multi-pages.docx    2019-02-25 15:37:09 +0900 (14b17ff)
@@ -0,0 +1,21 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document
+uri: file:/tmp/5vyOOm_multi-pages.docx
+path: /tmp/5vyOOm_multi-pages.docx
+size: 3889
+Content-Type: multipart/mixed; boundary=3115eb4345fdefc6080026ff9050b01c7b45a7ed
+
+--3115eb4345fdefc6080026ff9050b01c7b45a7ed
+mime-type: text/plain
+uri: file:/tmp/5vyOOm_multi-pages.txt
+path: /tmp/5vyOOm_multi-pages.txt
+size: 12
+created_time: 2019-02-25 06:37:27 UTC
+source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+creator: Writer
+producer: LibreOffice 5.2
+
+Page1
+Page2
+
+--3115eb4345fdefc6080026ff9050b01c7b45a7ed--

  Added: test/fixture/docx/one-page.docx (+20 -0) 100644
===================================================================
--- /dev/null
+++ test/fixture/docx/one-page.docx    2019-02-25 15:37:09 +0900 (0b90a5e)
@@ -0,0 +1,20 @@
+MIME-Version: 1.0
+mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document
+uri: file:/tmp/Gj8c67_one-page.docx
+path: /tmp/Gj8c67_one-page.docx
+size: 3871
+Content-Type: multipart/mixed; boundary=25ae687f54c50715bc6d276533ca98516cf4573d
+
+--25ae687f54c50715bc6d276533ca98516cf4573d
+mime-type: text/plain
+uri: file:/tmp/Gj8c67_one-page.txt
+path: /tmp/Gj8c67_one-page.txt
+size: 6
+created_time: 2019-02-25 06:37:29 UTC
+source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"]
+creator: Writer
+producer: LibreOffice 5.2
+
+Page1
+
+--25ae687f54c50715bc6d276533ca98516cf4573d--
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/dc06ab8d/attachment-0001.html>


More information about the Groonga-commit mailing list
Back to archive index