Kouhei Sutou 2019-02-25 15:37:09 +0900 (Mon, 25 Feb 2019) Revision: 7b5acfe9dbb7b8a0be3e61d5dcc61d62caa0a14d https://github.com/ranguba/chupa-text/commit/7b5acfe9dbb7b8a0be3e61d5dcc61d62caa0a14d Message: Add support docx Added files: lib/chupa-text/decomposers/office-open-xml-document.rb lib/chupa-text/decomposers/office-open-xml.rb test/decomposers/test-office-open-xml-document.rb test/fixture/docx/attributes.docx test/fixture/docx/multi-pages.docx test/fixture/docx/one-page.docx Added: lib/chupa-text/decomposers/office-open-xml-document.rb (+33 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/office-open-xml-document.rb 2019-02-25 15:37:09 +0900 (31266ba) @@ -0,0 +1,33 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "chupa-text/decomposers/office-open-xml" + +module ChupaText + module Decomposers + class OfficeOpenXMLDocument < OfficeOpenXML + registry.register("office-open-xml-document", self) + + def initialize(options={}) + super + @extension = "docx" + @mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + @path = "word/document.xml" + @namespace_uri = "http://schemas.openxmlformats.org/wordprocessingml/2006/main" + end + end + end +end Added: lib/chupa-text/decomposers/office-open-xml.rb (+176 -0) 100644 =================================================================== --- /dev/null +++ lib/chupa-text/decomposers/office-open-xml.rb 2019-02-25 15:37:09 +0900 (3bcfb21) @@ -0,0 +1,176 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. +# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. 
+# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +require "rexml/parsers/sax2parser" +require "rexml/sax2listener" + +require "archive/zip" + +module ChupaText + module Decomposers + class OfficeOpenXML < Decomposer + def target?(data) + data.extension == @extension or data.mime_type == @mime_type + end + + def target_score(data) + if target?(data) + -1 + else + nil + end + end + + def decompose(data) + text = nil + attributes = {} + data.open do |input| + Archive::Zip.open(input) do |zip| + zip.each do |entry| + next unless entry.file? + case entry.zip_path + when @path + text = "" + listener = TextListener.new(text, @namespace_uri) + parse(entry.file_data, listener) + when "docProps/app.xml" + listener = AttributesListener.new(attributes) + parse(entry.file_data, listener) + when "docProps/core.xml" + listener = AttributesListener.new(attributes) + parse(entry.file_data, listener) + end + end + end + end + text_data = TextData.new(text, source_data: data) + attributes.each do |name, value| + text_data[name] = value + end + yield(text_data) + end + + private + def parse(io, listener) + source = REXML::IOSource.new(io) + parser = REXML::Parsers::SAX2Parser.new(source) + parser.listen(listener) + parser.parse + end + + class TextListener + include REXML::SAX2Listener + + def initialize(output, target_uri) + @output = output + @target_uri = target_uri + @in_target = false + end + + def start_element(uri, local_name, qname, attributes) + return unless uri == @target_uri + case local_name + when "t" + @in_target = true + end + end + + def end_element(uri, local_name, qname) + return unless uri == @target_uri + case local_name + when "t" + @in_target = false + when "p", "br" + @output << "\n" + end + end + + def characters(text) + @output << text if @in_target + end + + def cdata(content) + @output 
<< content if @in_target + end + end + + class AttributesListener + include REXML::SAX2Listener + + CORE_PROPERTIES_URI = + "http://schemas.openxmlformats.org/package/2006/metadata/core-properties" + EXTENDED_PROPERTIES_URI = + "http://schemas.openxmlformats.org/officeDocument/2006/extended-properties" + DUBLIN_CORE_URI = "http://purl.org/dc/elements/1.1/" + DUBLIN_CORE_TERMS_URI = "http://purl.org/dc/terms/" + + def initialize(attributes) + @attributes = attributes + @name = nil + @type = nil + end + + def start_element(uri, local_name, qname, attributes) + case uri + when CORE_PROPERTIES_URI + case local_name + when "keywords" + @name = local_name + end + when EXTENDED_PROPERTIES_URI + case local_name + when "Application" + @name = local_name.downcase + end + when DUBLIN_CORE_URI + case local_name + when "description", "title", "subject" + @name = local_name + end + when DUBLIN_CORE_TERMS_URI + case local_name + when "created", "modified" + @name = "#{local_name}_time" + @type = :w3cdtf + end + end + end + + def end_element(uri, local_name, qname) + @name = nil + @type = nil + end + + def characters(text) + set_attribute(text) + end + + def cdata(content) + set_attribute(content) + end + + def set_attribute(value) + return if @name.nil? + case @type + when :w3cdtf + value = Time.xmlschema(value) + end + @attributes[@name] = value + end + end + end + end +end Added: test/decomposers/test-office-open-xml-document.rb (+129 -0) 100644 =================================================================== --- /dev/null +++ test/decomposers/test-office-open-xml-document.rb 2019-02-25 15:37:09 +0900 (61fa8ea) @@ -0,0 +1,129 @@ +# Copyright (C) 2019 Kouhei Sutou <kou****@clear*****> +# +# This library is free software; you can redistribute it and/or +# modify it under the terms of the GNU Lesser General Public +# License as published by the Free Software Foundation; either +# version 2.1 of the License, or (at your option) any later version. 
+# +# This library is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# Lesser General Public License for more details. +# +# You should have received a copy of the GNU Lesser General Public +# License along with this library; if not, write to the Free Software +# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA + +class TestDecomposersOfficeOpenXMLDocument < Test::Unit::TestCase + include Helper + + def setup + @decomposer = ChupaText::Decomposers::OfficeOpenXMLDocument.new({}) + end + + def decompose(path) + data = ChupaText::InputData.new(path) + decomposed = [] + @decomposer.decompose(data) do |decomposed_data| + decomposed << decomposed_data + end + decomposed + end + + sub_test_case("#target_score") do + def test_extension + data = ChupaText::Data.new + data.body = "" + data.uri = "document.docx" + assert_equal(-1, @decomposer.target_score(data)) + end + + def test_mime_type + data = ChupaText::Data.new + data.mime_type = "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + assert_equal(-1, @decomposer.target_score(data)) + end + end + + sub_test_case("#decompose") do + sub_test_case("attributes") do + def decompose(attribute_name) + super(fixture_path("docx", "attributes.docx")).collect do |data| + data[attribute_name] + end + end + + def test_title + assert_equal(["Title"], decompose("title")) + end + + def test_author + assert_equal([nil], decompose("author")) + end + + def test_subject + assert_equal(["Subject"], decompose("subject")) + end + + def test_keywords + assert_equal(["Keyword1 Keyword2"], decompose("keywords")) + end + + def test_created_time + assert_equal([Time], + decompose("created_time").collect(&:class)) + end + + def test_modified_time + assert_equal([Time], + decompose("modified_time").collect(&:class)) + end + + def test_application + 
assert_equal(["LibreOffice"], + normalize_applications(decompose("application"))) + end + + def normalize_applications(applications) + applications.collect do |application| + normalize_application(application) + end + end + + def normalize_application(application) + if application.start_with?("LibreOffice") + "LibreOffice" + else + application + end + end + + def test_creation_date + assert_equal([nil], decompose("creation_date")) + end + end + + sub_test_case("one page") do + def decompose + super(fixture_path("docx", "one-page.docx")) + end + + def test_body + assert_equal(["Page1\n"], decompose.collect(&:body)) + end + end + + sub_test_case("multi pages") do + def decompose + super(fixture_path("docx", "multi-pages.docx")) + end + + def test_body + assert_equal([<<-BODY], decompose.collect(&:body)) +Page1 +Page2 + BODY + end + end + end +end Added: test/fixture/docx/attributes.docx (+23 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/docx/attributes.docx 2019-02-25 15:37:09 +0900 (601c30d) @@ -0,0 +1,23 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document +uri: file:/tmp/FNZQyH_attributes.docx +path: /tmp/FNZQyH_attributes.docx +size: 3937 +Content-Type: multipart/mixed; boundary=a7e0c2515b16236373657441b64cd1810c423456 + +--a7e0c2515b16236373657441b64cd1810c423456 +mime-type: text/plain +uri: file:/tmp/FNZQyH_attributes.txt +path: /tmp/FNZQyH_attributes.txt +size: 6 +title: Title +created_time: 2019-02-25 06:37:26 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"] +subject: Subject +keywords: Keyword1, Keyword2 +creator: Writer +producer: LibreOffice 5.2 + +Page1 + +--a7e0c2515b16236373657441b64cd1810c423456-- Added: test/fixture/docx/multi-pages.docx (+21 -0) 100644 =================================================================== --- /dev/null +++ 
test/fixture/docx/multi-pages.docx 2019-02-25 15:37:09 +0900 (14b17ff) @@ -0,0 +1,21 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document +uri: file:/tmp/5vyOOm_multi-pages.docx +path: /tmp/5vyOOm_multi-pages.docx +size: 3889 +Content-Type: multipart/mixed; boundary=3115eb4345fdefc6080026ff9050b01c7b45a7ed + +--3115eb4345fdefc6080026ff9050b01c7b45a7ed +mime-type: text/plain +uri: file:/tmp/5vyOOm_multi-pages.txt +path: /tmp/5vyOOm_multi-pages.txt +size: 12 +created_time: 2019-02-25 06:37:27 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"] +creator: Writer +producer: LibreOffice 5.2 + +Page1 +Page2 + +--3115eb4345fdefc6080026ff9050b01c7b45a7ed-- Added: test/fixture/docx/one-page.docx (+20 -0) 100644 =================================================================== --- /dev/null +++ test/fixture/docx/one-page.docx 2019-02-25 15:37:09 +0900 (0b90a5e) @@ -0,0 +1,20 @@ +MIME-Version: 1.0 +mime-type: application/vnd.openxmlformats-officedocument.wordprocessingml.document +uri: file:/tmp/Gj8c67_one-page.docx +path: /tmp/Gj8c67_one-page.docx +size: 3871 +Content-Type: multipart/mixed; boundary=25ae687f54c50715bc6d276533ca98516cf4573d + +--25ae687f54c50715bc6d276533ca98516cf4573d +mime-type: text/plain +uri: file:/tmp/Gj8c67_one-page.txt +path: /tmp/Gj8c67_one-page.txt +size: 6 +created_time: 2019-02-25 06:37:29 UTC +source-mime-types: ["application/pdf", "application/vnd.openxmlformats-officedocument.wordprocessingml.document"] +creator: Writer +producer: LibreOffice 5.2 + +Page1 + +--25ae687f54c50715bc6d276533ca98516cf4573d-- -------------- next part -------------- An HTML attachment was scrubbed... URL: <https://lists.osdn.me/mailman/archives/groonga-commit/attachments/20190225/dc06ab8d/attachment-0001.html>