Feature #306 » 0001-implements-fulltext-extraction-for-attachments.patch
Gemfile | ||
---|---|---|
25 | 25 |
gem 'tzinfo-data', platforms: [:mingw, :x64_mingw, :mswin] |
26 | 26 |
gem "rbpdf", "~> 1.19.6" |
27 | 27 | |
28 |
gem 'plaintext' |
|
29 | ||
28 | 30 |
# Optional gem for LDAP authentication |
29 | 31 |
group :ldap do |
30 | 32 |
gem "net-ldap", "~> 0.16.0" |
config/configuration.yml.example | ||
---|---|---|
209 | 209 |
# allowed values: :memory, :file, :memcache |
210 | 210 |
#openid_authentication_store: :memory |
211 | 211 | |
212 |
# Text extraction helper programs. |
|
213 |
# |
|
214 |
# Commands should write the resulting plain text to STDOUT. Use __FILE__ as |
|
215 |
# placeholder for the file path. The values below are the defaults. |
|
216 |
# |
|
217 |
# To disable a certain extractor without having to remove it from your |
|
218 |
# system, set its command to a non-existent binary, e.g.: |
|
219 |
# |
|
220 |
# pdftotext: |
|
221 |
# - /usr/bin/pdftotext_disabled |
|
222 |
# |
|
223 |
text_extractors: |
|
224 |
# apt install poppler-utils |
|
225 |
# pdftotext: |
|
226 |
# - /usr/bin/pdftotext |
|
227 |
# - -enc |
|
228 |
# - UTF-8 |
|
229 |
# - __FILE__ |
|
230 |
# - '-' |
|
231 | ||
232 |
# apt install unrtf |
|
233 |
# unrtf: |
|
234 |
# - /usr/bin/unrtf |
|
235 |
# - --text |
|
236 |
# - __FILE__ |
|
237 | ||
238 |
# apt install catdoc |
|
239 |
# catdoc: |
|
240 |
# - /usr/bin/catdoc |
|
241 |
# - -dutf-8 |
|
242 |
# - __FILE__ |
|
243 |
# xls2csv: |
|
244 |
# - /usr/bin/xls2csv |
|
245 |
# - -dutf-8 |
|
246 |
# - __FILE__ |
|
247 |
# catppt: |
|
248 |
# - /usr/bin/catppt |
|
249 |
# - -dutf-8 |
|
250 |
# - __FILE__ |
|
251 | ||
252 |
# apt-get install tesseract-ocr |
|
253 |
# tesseract: |
|
254 |
# - /usr/bin/tesseract |
|
255 |
# - -dutf-8 |
|
256 |
# - __FILE__ |
|
257 | ||
212 | 258 |
# specific configuration options for production environment |
213 | 259 |
# that overrides the default ones |
214 | 260 |
production: |
lib/redmine/configuration.rb | ||
---|---|---|
65 | 65 |
end |
66 | 66 |
end |
67 | 67 | |
68 |
if text_extractors = @config['text_extractors'] |
|
69 |
Plaintext::Configuration.load YAML.dump text_extractors |
|
70 |
end |
|
71 | ||
68 | 72 |
check_regular_expressions |
69 | 73 |
@config |
70 | 74 |
end |
lib/redmine/text_extractor.rb | ||
---|---|---|
1 |
module Redmine
  # Extracts the plain text of an attachment by handing its file on disk and
  # its content type to Plaintext::Resolver.
  class TextExtractor
    # attachment - object providing +diskfile+ and +content_type+, both of
    #              which are passed to Plaintext::Resolver.
    def initialize(attachment)
      @attachment = attachment
    end

    # Returns the extracted fulltext or nil if no matching handler was found
    # for the file type.
    #
    # A StandardError raised during extraction is logged and turned into a
    # nil result. Anything outside the StandardError hierarchy (signals,
    # script errors, ...) is deliberately not rescued and propagates to the
    # caller untouched.
    def text
      Plaintext::Resolver.new(@attachment.diskfile,
                              @attachment.content_type).text
    rescue StandardError => e
      Rails.logger.error "error in fulltext extraction: #{e}"
      # explicit nil: otherwise the logger's return value would leak out
      nil
    end
  end
end
|
20 |
test/unit/lib/redmine/text_extractor_test.rb | ||
---|---|---|
1 |
require_relative '../../../test_helper' |
|
2 | ||
3 |
# Unit tests for Redmine::TextExtractor using files from the attachment
# fixtures directory.
class Redmine::TextExtractorTest < ActiveSupport::TestCase
  fixtures :projects, :users, :attachments

  setup do
    @project = Project.find_by_identifier 'ecookbook'
    set_fixtures_attachments_directory
    @dlopper = User.find_by_login 'dlopper'
  end

  # Builds and saves an attachment on @project from the named test file.
  # The content type is only overridden when one is given.
  def attachment_for(filename, content_type = nil)
    attachment = Attachment.new(
      container: @project,
      file: uploaded_test_file(filename, content_type),
      filename: filename,
      author: @dlopper
    )
    attachment.content_type = content_type if content_type
    attachment.save!
    attachment
  end

  test "should extract text from text file" do
    extracted = Redmine::TextExtractor.new(attachment_for("testfile.txt")).text
    assert extracted
    assert_match(/this is a text file for upload tests with multiple lines/, extracted)
  end

  test "should extract text from csv" do
    extracted = Redmine::TextExtractor.new(attachment_for("import_dates.csv")).text
    assert extracted
    assert_match(/Invalid start date/, extracted)
  end
end
|
37 |