Project

General

Profile

Feature #306 » 0002-implements-fulltext-extraction-for-attachments.patch

Jens Krämer, 2017-06-21 11:15

View differences:

app/controllers/admin_controller.rb
79 79
      [:text_file_repository_writable, File.writable?(Attachment.storage_path)],
80 80
      ["#{l :text_plugin_assets_writable} (./public/plugin_assets)",   File.writable?(Redmine::Plugin.public_directory)],
81 81
      [:text_rmagick_available,        Object.const_defined?(:Magick)],
82
      [:text_convert_available,        Redmine::Thumbnail.convert_available?]
82
      [:text_convert_available,        Redmine::Thumbnail.convert_available?],
83
      [:text_pdftotext_available,      Redmine::TextExtractor::PdfHandler.available?],
84
      [:text_unrtf_available,          Redmine::TextExtractor::RtfHandler.available?],
85
      [:text_catdoc_available,         Redmine::TextExtractor::DocHandler.available?],
86
      [:text_xls2csv_available,        Redmine::TextExtractor::XlsHandler.available?],
87
      [:text_catppt_available,         Redmine::TextExtractor::PptHandler.available?]
83 88
    ]
84 89
  end
85 90
end
config/configuration.yml.example
212 212
  # allowed values: :memory, :file, :memcache
213 213
  #openid_authentication_store: :memory
214 214

  
215
  # Text extraction helper programs.
216
  #
217
  # Commands should write the resulting plain text to STDOUT. Use __FILE__ as
218
  # placeholder for the file path. The values below are the defaults.
219
  text_extractors:
220
    # apt install poppler-utils
221
    # pdftotext:
222
    #   - /usr/bin/pdftotext
223
    #   - -enc
224
    #   - UTF-8
225
    #   - __FILE__
226
    #   - '-'
227

  
228
    # apt install unrtf
229
    # unrtf:
230
    #   - /usr/bin/unrtf
231
    #   - --text
232
    #   - __FILE__
233

  
234
    # apt install catdoc
235
    # catdoc:
236
    #   - /usr/bin/catdoc
237
    #   - -dutf-8
238
    #   - __FILE__
239
    # xls2csv:
240
    #   - /usr/bin/xls2csv
241
    #   - -dutf-8
242
    #   - __FILE__
243
    # catppt:
244
    #   - /usr/bin/catppt
245
    #   - -dutf-8
246
    #   - __FILE__
247

  
215 248
# specific configuration options for production environment
216 249
# that overrides the default ones
217 250
production:
config/locales/en.yml
1133 1133
  text_plugin_assets_writable: Plugin assets directory writable
1134 1134
  text_rmagick_available: RMagick available (optional)
1135 1135
  text_convert_available: ImageMagick convert available (optional)
1136
  text_pdftotext_available: Pdftotext available (optional)
1137
  text_unrtf_available: Unrtf available (optional)
1138
  text_catdoc_available: Catdoc available (optional)
1139
  text_xls2csv_available: Xls2csv available (optional)
1140
  text_catppt_available: Catppt available (optional)
1136 1141
  text_destroy_time_entries_question: "%{hours} hours were reported on the issues you are about to delete. What do you want to do?"
1137 1142
  text_destroy_time_entries: Delete reported hours
1138 1143
  text_assign_time_entries_to_project: Assign reported hours to the project
lib/redmine/text_extractor.rb
1
module Redmine
2
  class TextExtractor
3

  
4
    # Upper bound handed to `limit` when #text truncates the extracted text.
    MAX_FULLTEXT_LENGTH = 4.megabytes
5
    # Extractor command overrides read from the 'text_extractors' configuration
    # key; an empty hash when that key is not set.
    TEXT_EXTRACTORS = Redmine::Configuration['text_extractors'] || {}
6

  
7
    # Remembers the attachment whose fulltext #text will extract.
    def initialize(attachment)
      @attachment = attachment
    end
10

  
11
    # Returns the extracted fulltext, or nil if no matching handler was found
    # for the file type, the handler produced no text, or extraction raised a
    # StandardError.
    #
    # Runs of whitespace are collapsed into single spaces, the result is
    # stripped, composed and limited to MAX_FULLTEXT_LENGTH.
    def text
      handler = find_handler
      raw = handler && handler.text(@attachment)
      return unless raw

      # Non-destructive gsub/strip: a handler may hand back a frozen string
      # (e.g. the result of nil.to_s on recent Rubies), which the bang
      # variants refuse to touch.
      normalized = raw.gsub(/\s+/, ' ').strip
      normalized.mb_chars.compose.limit(MAX_FULLTEXT_LENGTH).to_s
    rescue StandardError => e
      # Extraction is best effort: log and report "no text". Anything that is
      # not a StandardError (signals, SyntaxError, ...) propagates untouched.
      Rails.logger.error "error in fulltext extraction: #{e}"
      nil
    end
23

  
24
    private
25

  
26
    # Picks the first handler in the chain that accepts the attachment;
    # nil when none does.
    def find_handler
      @@file_handlers.find { |candidate| candidate.accept?(@attachment) }
    end
29

  
30

  
31
    # Base class of all handlers: decides by content type whether a handler
    # is responsible for an attachment.
    #
    # Subclasses set either @content_type (a single type) or @content_types
    # (a list of types); @content_type takes precedence when it is set.
    class FileHandler
      # Returns true if the attachment's content type is one this handler
      # declares, false otherwise (also when the handler declares none or the
      # attachment has no content type).
      #
      # Media type names are case-insensitive, so both sides are downcased
      # before they are compared.
      def accept?(attachment)
        actual = attachment.content_type.to_s.downcase
        declared = @content_type ? [@content_type] : Array(@content_types)
        declared.any? { |type| type.to_s.downcase == actual }
      end
    end
42

  
43
    # Base class for handlers that extract text by running an external helper
    # program through shellout.
    #
    # Subclasses set @command to an argument array whose first element is the
    # program path and in which FILE_PLACEHOLDER marks where the attachment's
    # file path goes.
    class ExternalCommandHandler < FileHandler
      include Redmine::Utils::Shell

      FILE_PLACEHOLDER = '__FILE__'.freeze

      # Runs the command on the attachment's disk file and returns what was
      # read from the command, converted with to_s.
      #
      # Raises ArgumentError when the command has no FILE_PLACEHOLDER
      # argument, because the helper would not be told which file to read.
      def text(attachment)
        unless @command.include?(FILE_PLACEHOLDER)
          raise ArgumentError,
                "#{FILE_PLACEHOLDER} placeholder missing in command #{@command.inspect}"
        end

        # map builds a fresh array, so the (possibly frozen) command template
        # is never modified, and every placeholder occurrence is replaced.
        cmd = @command.map { |arg| arg == FILE_PLACEHOLDER ? attachment.diskfile : arg }
        shellout(cmd) { |io| io.read }.to_s
      end

      # Accepts an attachment only if the content type matches and the
      # helper program can actually be run.
      def accept?(attachment)
        super && available?
      end

      # True if a command is set and its first element is an executable file.
      def available?
        @command.present? && File.executable?(@command.first)
      end

      # Class-level shortcut: checks availability on a fresh instance.
      def self.available?
        new.available?
      end
    end
66

  
67

  
68
    # Handles 'application/pdf' attachments through an external command: the
    # 'pdftotext' entry of TEXT_EXTRACTORS if present, DEFAULT otherwise.
    class PdfHandler < ExternalCommandHandler
      DEFAULT = %w[/usr/bin/pdftotext -enc UTF-8 __FILE__ -].freeze

      def initialize
        @command = TEXT_EXTRACTORS['pdftotext'] || DEFAULT
        @content_type = 'application/pdf'
      end
    end
77

  
78

  
79
    # Handles 'application/rtf' attachments through an external command: the
    # 'unrtf' entry of TEXT_EXTRACTORS if present, DEFAULT otherwise.
    class RtfHandler < ExternalCommandHandler
      DEFAULT = %w[/usr/bin/unrtf --text __FILE__].freeze

      def initialize
        @command = TEXT_EXTRACTORS['unrtf'] || DEFAULT
        @content_type = 'application/rtf'
      end
    end
88

  
89

  
90
    # Handler base class for XML based (MS / Open / Libre) office documents.
    #
    # The attachment is opened as a zip archive. Subclasses set @file (name of
    # the archive entry to parse) and @element / @namespace_uri (local name and
    # namespace of the XML element whose character data is collected).
    class ZippedXmlHandler < FileHandler

      # SAX callbacks that collect the character data found inside one kind
      # of element and append a space after each such element.
      class SaxDocument < Nokogiri::XML::SAX::Document
        attr_reader :text

        def initialize(text_element, text_namespace)
          @element = text_element
          @namespace_uri = text_namespace
          @text = ''.dup
          # Number of currently open text elements. A counter rather than a
          # flag, so collecting continues after a nested text element closes.
          @depth = 0
        end

        # Handle each element, expecting the name and any attributes
        def start_element_namespace(name, attrs = [], prefix = nil, uri = nil, ns = [])
          @depth += 1 if text_element?(name, uri)
        end

        # Any characters between the start and end element expected as a string
        def characters(string)
          @text << string if @depth > 0
        end

        # Given the name of an element once its closing tag is reached
        def end_element_namespace(name, prefix = nil, uri = nil)
          return unless text_element?(name, uri)

          @text << ' '
          @depth -= 1 if @depth > 0
        end

        private

        # True for the element (name and namespace) this document collects.
        def text_element?(name, uri)
          name == @element && uri == @namespace_uri
        end
      end

      # Returns the text collected from the archive entry named @file, or nil
      # when the archive has no such entry.
      def text(attachment)
        Zip::File.open(attachment.diskfile) do |zip_file|
          entry = zip_file.find_entry(@file)
          # Block form so the entry's input stream is closed after parsing.
          entry && entry.get_input_stream { |io| xml_to_text(io) }
        end
      end

      private

      # Parses the XML read from io and returns the collected text.
      def xml_to_text(io)
        sax_doc = SaxDocument.new(@element, @namespace_uri)
        Nokogiri::XML::SAX::Parser.new(sax_doc).parse(io)
        sax_doc.text
      end
    end
142

  
143

  
144
    # Base class for extractors for MS Office formats: text is collected
    # from elements named 't'; subclasses supply the namespace.
    class OfficeDocumentHandler < ZippedXmlHandler
      def initialize
        super
        @element = 't'
      end
    end
151

  
152

  
153
    # Handles wordprocessingml (docx) attachments by parsing the archive
    # entry 'word/document.xml'.
    class DocxHandler < OfficeDocumentHandler
      def initialize
        super
        @file = 'word/document.xml'
        @namespace_uri = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
        @content_type = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
      end
    end
161

  
162

  
163
    # Handles spreadsheetml (xlsx) attachments by parsing the archive entry
    # 'xl/sharedStrings.xml'.
    class XlsxHandler < OfficeDocumentHandler
      def initialize
        super
        @file = 'xl/sharedStrings.xml'
        @namespace_uri = 'http://schemas.openxmlformats.org/spreadsheetml/2006/main'
        @content_type = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
      end
    end
171

  
172

  
173

  
174
    # Handles presentationml (pptx and related) attachments: collects the
    # text of every slide entry and joins it in slide-number order.
    class PptxHandler < OfficeDocumentHandler
      CONTENT_TYPES = [
        'application/vnd.openxmlformats-officedocument.presentationml.presentation',
        'application/vnd.openxmlformats-officedocument.presentationml.slideshow',
        'application/vnd.ms-powerpoint.template.macroEnabled.12',
      ].freeze

      # Captures the slide number of a slide entry. Anchored at the end so
      # that names such as "slide1.xml.rels" are not taken for slides.
      SLIDE_ENTRY = /slide(\d+)\.xml\z/

      def initialize
        super
        @content_types = CONTENT_TYPES
        @namespace_uri = 'http://schemas.openxmlformats.org/drawingml/2006/main'
      end

      # Returns the text of all slides, separated by spaces, ordered by the
      # number in the entry name.
      def text(attachment)
        slides = []
        Zip::File.open(attachment.diskfile) do |zip_file|
          zip_file.each do |entry|
            match = SLIDE_ENTRY.match(entry.name)
            next unless match

            # Keep the number as an Integer: compared as strings, "10" would
            # sort before "2".
            slide_text = entry.get_input_stream { |io| xml_to_text(io) }
            slides << [match[1].to_i, slide_text]
          end
        end
        slides.sort_by(&:first).map(&:last).join(' ')
      end
    end
200

  
201

  
202
    # Extractor for Open / Libre Office formats. Parses the archive entry
    # 'content.xml' and collects the text of 'p' elements in the
    # opendocument text namespace.
    # NOTE(review): other text-bearing elements (e.g. headings) are not
    # collected -- confirm whether that is intended.
    class OpendocumentHandler < ZippedXmlHandler
      CONTENT_TYPES = [
        'application/vnd.oasis.opendocument.presentation',
        'application/vnd.oasis.opendocument.presentation-template',
        'application/vnd.oasis.opendocument.text',
        'application/vnd.oasis.opendocument.text-template',
        'application/vnd.oasis.opendocument.spreadsheet',
        'application/vnd.oasis.opendocument.spreadsheet-template'
      ].freeze

      def initialize
        super
        @file = 'content.xml'
        @content_types = CONTENT_TYPES
        @element = 'p'
        @namespace_uri = 'urn:oasis:names:tc:opendocument:xmlns:text:1.0'
      end
    end
220

  
221

  
222

  
223
    # Handles legacy MS Word attachments through an external command: the
    # 'catdoc' entry of TEXT_EXTRACTORS if present, DEFAULT otherwise.
    class DocHandler < ExternalCommandHandler
      CONTENT_TYPES = [
        'application/vnd.ms-word',
        'application/msword',
      ].freeze
      DEFAULT = [
        '/usr/bin/catdoc', '-dutf-8', '__FILE__'
      ].freeze

      def initialize
        @content_types = CONTENT_TYPES
        @command = TEXT_EXTRACTORS['catdoc'] || DEFAULT
      end
    end
236

  
237
    # Handles legacy MS Excel attachments through an external command: the
    # 'xls2csv' entry of TEXT_EXTRACTORS if present, DEFAULT otherwise.
    class XlsHandler < ExternalCommandHandler
      CONTENT_TYPES = [
        'application/vnd.ms-excel',
        'application/excel',
      ].freeze
      DEFAULT = [
        '/usr/bin/xls2csv', '-dutf-8', '__FILE__'
      ].freeze

      def initialize
        @content_types = CONTENT_TYPES
        @command = TEXT_EXTRACTORS['xls2csv'] || DEFAULT
      end

      # Post-processes the command output: removes double quotes and turns
      # each run of commas into a single space. Returns nil if the
      # superclass returned nothing.
      def text(*_)
        str = super
        str && str.delete('"').gsub(/,+/, ' ')
      end
    end
255

  
256
    # Handles legacy MS PowerPoint attachments through an external command:
    # the 'catppt' entry of TEXT_EXTRACTORS if present, DEFAULT otherwise.
    class PptHandler < ExternalCommandHandler
      CONTENT_TYPES = [
        'application/vnd.ms-powerpoint',
        'application/powerpoint',
      ].freeze
      DEFAULT = [
        '/usr/bin/catppt', '-dutf-8', '__FILE__'
      ].freeze

      def initialize
        @content_types = CONTENT_TYPES
        @command = TEXT_EXTRACTORS['catppt'] || DEFAULT
      end
    end
269

  
270

  
271
    # Handles text/csv and text/plain attachments by reading the file and
    # passing its content through Redmine::CodesetUtil.to_utf8.
    class PlaintextHandler < FileHandler
      CONTENT_TYPES = %w(text/csv text/plain).freeze

      def initialize
        @content_types = CONTENT_TYPES
      end

      # Returns the result of to_utf8 for the whole content of the
      # attachment's disk file.
      def text(attachment)
        # File.read rather than IO.read: IO.read treats an argument that
        # starts with "|" as a command to run instead of a file to open.
        Redmine::CodesetUtil.to_utf8(File.read(attachment.diskfile), 'UTF-8')
      end
    end
280

  
281
    # The handler chain: one instance per handler class, created when this
    # class body is evaluated and searched in order by find_handler. List most
    # specific handlers first and more general (fallback) handlers later.
    @@file_handlers = [
      PdfHandler,
      OpendocumentHandler,
      DocxHandler,
      XlsxHandler,
      PptxHandler,
      DocHandler,
      XlsHandler,
      PptHandler,
      RtfHandler,
      PlaintextHandler
    ].map { |handler_class| handler_class.new }
291

  
292
  end
293

  
294
end
295

  
... This diff was truncated because it exceeds the maximum size that can be displayed.
(5-5/11)