null+****@clear*****
null+****@clear*****
Mon Jan 30 16:04:08 JST 2012
kdmsnr 2012-01-30 16:04:08 +0900 (Mon, 30 Jan 2012)

  New Revision: de372716aeebd30129c20e2795e1f9e160202cc7

  Merged 76ab299: Merge pull request #26 from kdmsnr/mozilla_glossaries

  Log:
    add mozilla_japan external glossary

  Added files:
    lib/logaling/external_glossaries/mozilla_japan.rb

  Added: lib/logaling/external_glossaries/mozilla_japan.rb (+47 -0) 100644
===================================================================
--- /dev/null
+++ lib/logaling/external_glossaries/mozilla_japan.rb 2012-01-30 16:04:08 +0900 (b795515)
@@ -0,0 +1,47 @@
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+require 'open-uri'
+require 'nokogiri'
+
+module Logaling
+  class MozillaJapan < ExternalGlossary
+    description 'Mozilla Japan (http://www.mozilla-japan.org/jp/l10n/term/l10n.html)'
+    source_language 'en'
+    target_language 'ja'
+    output_format 'csv'
+
+    private
+    def convert_to_csv(csv)
+      url = 'http://www.mozilla-japan.org/jp/l10n/term/l10n.html'
+      doc = ::Nokogiri::HTML(open(url, "r"))
+      doc.encoding = "UTF-8"
+      doc.search(".obsolete").remove
+      doc.search(".relate").remove
+      doc.css("dl[@class='terminology en-ja']").each do |dl|
+        dl.children.each_slice(2) do |dt, dd|
+          dd.text.split("|").each do |ddt|
+            ddt = ddt.gsub(/\s/, '')
+            unless ddt.empty?
+              csv << [dust_to_tilda(dt.text), dust_to_tilda(ddt)]
+            end
+          end
+        end
+      end
+    end
+
+    def dust_to_tilda(txt)
+      txt.gsub("\xEF\xBF\xBD", "\xE3\x80\x9C")
+    end
+  end
+end