From b1bc44a2f229da13b15cdbe0afecc782f9961465 Mon Sep 17 00:00:00 2001 From: Jens Kraemer Date: Wed, 14 Jun 2017 09:53:07 +0800 Subject: [PATCH 1/5] implements fulltext extraction for attachments - by means of the plaintext gem --- Gemfile | 2 ++ config/configuration.yml.example | 46 ++++++++++++++++++++++++++++ lib/redmine/configuration.rb | 4 +++ lib/redmine/text_extractor.rb | 20 ++++++++++++ test/unit/lib/redmine/text_extractor_test.rb | 37 ++++++++++++++++++++++ 5 files changed, 109 insertions(+) create mode 100644 lib/redmine/text_extractor.rb create mode 100644 test/unit/lib/redmine/text_extractor_test.rb diff --git a/Gemfile b/Gemfile index b9a176439..a8d1896da 100644 --- a/Gemfile +++ b/Gemfile @@ -25,6 +25,8 @@ gem "rails-html-sanitizer", ">= 1.0.3" gem 'tzinfo-data', platforms: [:mingw, :x64_mingw, :mswin] gem "rbpdf", "~> 1.19.6" +gem 'plaintext' + # Optional gem for LDAP authentication group :ldap do gem "net-ldap", "~> 0.16.0" diff --git a/config/configuration.yml.example b/config/configuration.yml.example index bff4c9740..18981a4aa 100644 --- a/config/configuration.yml.example +++ b/config/configuration.yml.example @@ -209,6 +209,52 @@ default: # allowed values: :memory, :file, :memcache #openid_authentication_store: :memory + # Text extraction helper programs. + # + # commands should write the resulting plain text to STDOUT. Use __FILE__ as + # placeholder for the file path. The values below are the defaults. 
+ # + # To disable a certain extractor without having to remove it from your + # system, set its command to a nonexistent binary, e.g.: + # + # pdftotext: + # - /usr/bin/pdftotext_disabled + # + text_extractors: + # apt install poppler-utils + # pdftotext: + # - /usr/bin/pdftotext + # - -enc + # - UTF-8 + # - __FILE__ + # - '-' + + # apt install unrtf + # unrtf: + # - /usr/bin/unrtf + # - --text + # - __FILE__ + + # apt install catdoc + # catdoc: + # - /usr/bin/catdoc + # - -dutf-8 + # - __FILE__ + # xls2csv: + # - /usr/bin/xls2csv + # - -dutf-8 + # - __FILE__ + # catppt: + # - /usr/bin/catppt + # - -dutf-8 + # - __FILE__ + + # apt-get install tesseract-ocr + # tesseract: + # - /usr/bin/tesseract + # - -dutf-8 + # - __FILE__ + # specific configuration options for production environment # that overrides the default ones production: diff --git a/lib/redmine/configuration.rb b/lib/redmine/configuration.rb index 6ca65304c..2c54529bb 100644 --- a/lib/redmine/configuration.rb +++ b/lib/redmine/configuration.rb @@ -65,6 +65,10 @@ module Redmine end end + if text_extractors = @config['text_extractors'] + Plaintext::Configuration.load YAML.dump text_extractors + end + check_regular_expressions @config end diff --git a/lib/redmine/text_extractor.rb b/lib/redmine/text_extractor.rb new file mode 100644 index 000000000..8d2f9e69c --- /dev/null +++ b/lib/redmine/text_extractor.rb @@ -0,0 +1,20 @@ +module Redmine + class TextExtractor + + def initialize(attachment) + @attachment = attachment + end + + # returns the extracted fulltext or nil if no matching handler was found + # for the file type. + def text + Plaintext::Resolver.new(@attachment.diskfile, + @attachment.content_type).text + rescue Exception => e + Rails.logger.error "error in fulltext extraction: #{e}" + raise e unless e.is_a? 
StandardError # re-raise Signals / SyntaxErrors etc + end + + end +end + diff --git a/test/unit/lib/redmine/text_extractor_test.rb b/test/unit/lib/redmine/text_extractor_test.rb new file mode 100644 index 000000000..8411d2ee3 --- /dev/null +++ b/test/unit/lib/redmine/text_extractor_test.rb @@ -0,0 +1,37 @@ +require_relative '../../../test_helper' + +class Redmine::TextExtractorTest < ActiveSupport::TestCase + fixtures :projects, :users, :attachments + + setup do + @project = Project.find_by_identifier 'ecookbook' + set_fixtures_attachments_directory + @dlopper = User.find_by_login 'dlopper' + end + + def attachment_for(filename, content_type = nil) + Attachment.new(container: @project, + file: uploaded_test_file(filename, content_type), + filename: filename, + author: @dlopper).tap do |a| + a.content_type = content_type if content_type + a.save! + end + end + + test "should extract text from text file" do + a = attachment_for "testfile.txt" + te = Redmine::TextExtractor.new a + assert text = te.text + assert_match /this is a text file for upload tests with multiple lines/, text + end + + test "should extract text from csv" do + a = attachment_for "import_dates.csv" + te = Redmine::TextExtractor.new a + assert text = te.text + assert_match /Invalid start date/, text + end + +end + -- 2.11.0