class ChupaText::Data
Attributes
@return [Attributes] The attributes of the data.
@return [String, nil] The content of the data, `nil` if the data
doesn't have any content.
@return [Array<Integer, Integer>] the expected screenshot size.
@return [Numeric, String, nil] the max memory on extraction by
external command.
@return [Numeric, String, nil] the max CPU time on extraction by
external command.
@return [Integer, nil] the max body size in bytes.
@param [Bool] value `true` when screenshot is needed. @return [Bool] the specified value
@return [String, nil] The path associated with the content of
the data, `nil` if the data isn't associated with any file. The path may not be related to the original content. For example, `"/tmp/XXX.txt"` may be returned for the data of `"http://example.com/XXX.txt"`. This value is useful when using an external command to extract text and meta-data.
@return [Screenshot, nil] The screenshot of the data. For example,
the first page image for a PDF file.
@return [Integer, nil] The byte size of the data, `nil` if the data
doesn't have any content.
@return [Data, nil] The source of the data. For example, text
data (`hello.txt`) in archive data (`hello.tar`) has the archive data in {#source}.
@return [Numeric, String, nil] the timeout on extraction.
@return [URI, nil] The URI of the data if the data is for remote
or local file, `nil` if the data isn't associated with any URIs.
Public Class Methods
Source
# File lib/chupa-text/data.rb, line 82
#
# Resets every field to its default and, when `options[:source_data]`
# is given, merges that data's metadata into this one and keeps it as
# the source.
#
# @param [Hash, nil] options The options. `nil` is treated as `{}`.
#   The `:source_data` entry, when present, is merged and stored as
#   the source of this data.
def initialize(options={})
  # Fields that start out unset.
  @uri = @body = @size = @path = @mime_type = nil
  @source = @screenshot = nil
  @max_body_size = @timeout = @limit_cpu = @limit_as = nil

  # Fields with non-nil defaults.
  @attributes = Attributes.new
  @need_screenshot = true
  @expected_screenshot_size = [200, 200]

  @options = options || {}
  origin = @options[:source_data]
  if origin
    merge!(origin)
    @source = origin
  end
end
Public Instance Methods
Source
# File lib/chupa-text/data.rb, line 171
#
# Stores `value` under `name` in the attributes of the data.
#
# @param name The attribute name.
# @param value The attribute value.
def []=(name, value)
  @attributes[name] = value
end
Source
# File lib/chupa-text/data.rb, line 193
#
# Computes the lower-cased extension (without the leading dot) of the
# URI path.
#
# @return [String, nil] `"html"` for an HTTP URI whose path ends with
#   `"/"`, otherwise the lower-cased extension of the URI path (an
#   empty string when the path has no extension). `nil` when no URI
#   is set or when the URI has no path component.
def extension
  return nil if @uri.nil?

  uri_path = @uri.path
  # Opaque URIs such as "mailto:..." have no path; File.extname would
  # raise TypeError on nil, so report "no extension" instead.
  return nil if uri_path.nil?

  if @uri.is_a?(URI::HTTP) && uri_path.end_with?("/")
    "html"
  else
    File.extname(uri_path).downcase.sub(/\A\./, "")
  end
end
@return [String, nil] Normalized extension as String if {#uri}
is not `nil`, `nil` otherwise. The normalized extension uses lower case like `pdf` not `PDF`.
Source
# File lib/chupa-text/data.rb, line 105
#
# Copy hook: after the default copy, gives the copy its own duplicate
# of the attributes instead of the object shared with the original.
#
# @param object The object being copied from.
# @return [self]
def initialize_copy(object)
  super
  @attributes = @attributes.dup
  self
end
Source
# File lib/chupa-text/data.rb, line 116
#
# Copies the URI, path, attributes and extraction settings of `data`
# into this data. When `data` has a MIME type, it is also prepended
# to the "source-mime-types" attribute.
#
# @param [Data] data The data to be merged.
def merge!(data)
  # Location.
  self.uri = data.uri
  self.path = data.path

  # Attributes.
  data.attributes.each { |key, attribute| self[key] = attribute }

  # Remember the MIME type of the data we were derived from.
  if data.mime_type
    self["source-mime-types"] ||= []
    self["source-mime-types"].unshift(data.mime_type)
  end

  # Screenshot settings and extraction limits.
  self.need_screenshot = data.need_screenshot?
  self.expected_screenshot_size = data.expected_screenshot_size
  self.max_body_size = data.max_body_size
  self.timeout = data.timeout
  self.limit_cpu = data.limit_cpu
  self.limit_as = data.limit_as
end
Merges metadata from data.
@param [Data] data The data to be merged.
@return [void]
Source
# File lib/chupa-text/data.rb, line 179
#
# Returns the explicitly set MIME type, falling back to
# `guess_mime_type` when none is set.
def mime_type
  return @mime_type if @mime_type

  guess_mime_type
end
@return [String] The MIME type of the data. If MIME type
isn't set, guesses MIME type from path and body.
@return [nil] If MIME type isn’t set and it can’t guess MIME type
from path and body.
Source
# File lib/chupa-text/data.rb, line 186
#
# Sets the MIME type of the data.
#
# @param type The MIME type to store; it is stored as given.
def mime_type=(type)
  @mime_type = type
end
@param [String, nil] type The MIME type of the data. You can
unset MIME type by `nil`. If you unset MIME type, MIME type is guessed from path and body of the data.
Source
# File lib/chupa-text/data.rb, line 215
#
# Reports the stored "need screenshot" flag.
def need_screenshot?
  @need_screenshot
end
@return [Bool] `true` when screenshot is needed if available.
Source
# File lib/chupa-text/data.rb, line 154
#
# Wraps the body in a `StringIO` and yields it to the given block.
#
# @yield [StringIO] An IO-like reader over the body.
# @return The value of the block.
def open
  input = StringIO.new(body)
  yield(input)
end
Source
# File lib/chupa-text/data.rb, line 161
#
# Returns the leading part of the body without consuming it.
#
# @param size The length passed to `body[0, size]`.
# @return The leading slice of the body, or `nil` when there is no body.
def peek_body(size)
  content = body
  content.nil? ? nil : content[0, size]
end
Source
# File lib/chupa-text/data.rb, line 204
#
# Tells whether the MIME type starts with "text/". A missing MIME
# type counts as not text.
def text?
  type = mime_type
  type ? type.start_with?("text/") : false
end
@return [Bool] true if MIME type is “text/XXX”, false
otherwise.
Source
# File lib/chupa-text/data.rb, line 210
#
# Tells whether the MIME type is exactly "text/plain".
def text_plain?
  mime_type == "text/plain"
end
@return [Bool] true if MIME type is “text/plain”, false
otherwise.
Source
# File lib/chupa-text/data.rb, line 219
#
# Returns data whose body has been passed through `UTF8Converter`.
# When a max body size is set, only that much of the body is read.
#
# @return `self` when there is no body, or when no max body size is
#   set and the converter returned the very same body object;
#   otherwise a new `TextData` built from the converted body with
#   `self` as its source data.
def to_utf8_body_data
  raw =
    if @max_body_size
      limited = nil
      open { |input| limited = input.read(@max_body_size) }
      limited
    else
      body
    end
  return self if raw.nil?

  utf8_body = UTF8Converter.new(raw).convert
  # Nothing was truncated and nothing was converted: reuse this data.
  return self if @max_body_size.nil? && raw.equal?(utf8_body)

  TextData.new(utf8_body, source_data: self)
end
Source
# File lib/chupa-text/data.rb, line 136
#
# Sets the URI of the data and updates the path to match.
#
# * `nil` clears both the URI and the path.
# * A `Pathname` is converted to a `file://` URI and kept as the path.
# * A `URI` is stored as is; anything else is parsed with `URI.parse`.
#   In both cases the path becomes the path of the URI.
def uri=(uri)
  case uri
  when nil
    @uri = nil
    self.path = nil
  when Pathname
    @uri = URI.parse(convert_pathname_to_file_uri(uri))
    self.path = uri
  else
    @uri = uri.is_a?(URI) ? uri : URI.parse(uri)
    self.path = @uri.path
  end
end
@param [String, URI, nil] uri The URI for the data. If `uri` is
`nil`, it means that the data isn't associated with any URIs.
Private Instance Methods
Source
# File lib/chupa-text/data.rb, line 276
#
# Temporarily relabels `string` with `encoding` (in place, without
# transcoding), yields it, and restores the original encoding label
# afterwards, even when the block raises or returns early.
#
# @param string The string to relabel; nothing is yielded for `nil`.
# @param encoding The encoding to apply while the block runs.
# @return The value of the block, or `nil` when `string` is `nil`.
def change_encoding(string, encoding)
  return if string.nil?

  begin
    saved_encoding = string.encoding
    string.force_encoding(encoding)
    yield(string)
  ensure
    string.force_encoding(saved_encoding)
  end
end
Source
# File lib/chupa-text/data.rb, line 240
#
# Builds a `file://` URI string from a pathname. The path is expanded
# to an absolute path and each component below the root is escaped
# with `CGI.escape`.
#
# @param [Pathname] path The path to convert.
# @return [String] The `file://` URI string.
def convert_pathname_to_file_uri(path)
  # Peel components off the end until only the root is left.
  parent, leaf = path.expand_path.split
  segments = [CGI.escape(leaf.to_s)]
  until parent.root?
    parent, leaf = parent.split
    segments.unshift(CGI.escape(leaf.to_s))
  end

  "file://#{parent + segments.join("/")}"
end
Source
# File lib/chupa-text/data.rb, line 255
#
# Guesses the MIME type from the URI first and falls back to the body
# only when the URI gives no answer.
def guess_mime_type
  guess_mime_type_from_uri || guess_mime_type_from_body
end
Source
# File lib/chupa-text/data.rb, line 264
#
# Guesses "text/plain" by inspecting the first 1024 units of the body
# as UTF-8.
#
# The peeked chunk is treated as text when it is valid UTF-8 and at
# most 1% of its bytes are NUL characters. Because the peek may cut
# the body in the middle of a multi-byte character, a chunk that is
# exactly the peek size is allowed to lose up to three trailing bytes
# (the longest possible partial UTF-8 character) before validation.
#
# @return [String, nil] `"text/plain"` when the body looks like text,
#   `nil` otherwise or when there is no body.
def guess_mime_type_from_body
  peek_size = 1024
  guessed = nil
  chunk = peek_body(peek_size)
  change_encoding(chunk, "UTF-8") do |utf8_chunk|
    text = utf8_chunk
    unless text.valid_encoding?
      # A chunk shorter or longer than the peek size was not cut at a
      # byte boundary, so its invalid bytes are genuine.
      return nil unless text.bytesize == peek_size

      candidates = (1..3).map do |dropped|
        text.byteslice(0, text.bytesize - dropped)
      end
      text = candidates.find(&:valid_encoding?)
      return nil if text.nil?
    end
    n_null_characters = text.count("\u0000")
    return nil if n_null_characters > (text.bytesize * 0.01)
    guessed = "text/plain"
  end
  guessed
end
Source
# File lib/chupa-text/data.rb, line 260
#
# Looks up the extension of the data in the MIME type registry.
#
# @return Whatever `MIMEType.registry.find` returns for the extension.
def guess_mime_type_from_uri
  registry = MIMEType.registry
  registry.find(extension)
end