Add image scraping and optimization filter

Rel: #633
This commit is contained in:
Thibaut Courouble 2017-07-16 11:09:19 -04:00
parent 6fc48db8af
commit a6855329e8
7 changed files with 144 additions and 1 deletions

24
.image_optim.yml Normal file
View file

@ -0,0 +1,24 @@
# Configuration for the image_optim gem.
# Top-level scalar keys are global options; every other key names an
# optimization worker. Per the image_optim documentation, `false` disables a
# worker and a nested mapping passes options to it.

verbose: false
skip_missing_workers: true
allow_lossy: true

advpng: false

gifsicle:
  interlace: false
  level: 3
  careful: true

jhead: false

jpegoptim:
  strip: all
  max_quality: 100

jpegrecompress: false
jpegtran: false

optipng:
  level: 3
  interlace: false
  strip: true

pngcrush: false
pngout: false

pngquant:
  # Ruby Range, deserialized through the !ruby/range YAML tag.
  quality: !ruby/range 80..99
  speed: 3

svgo: false

View file

@ -32,6 +32,8 @@ group :docs do
gem 'typhoeus'
gem 'nokogiri'
gem 'html-pipeline'
gem 'image_optim'
gem 'image_optim_pack', platforms: :ruby
gem 'progress_bar', require: false
gem 'unix_utils', require: false
gem 'tty-pager', require: false

View file

@ -25,12 +25,25 @@ GEM
ffi (>= 1.3.0)
eventmachine (1.2.3)
execjs (2.7.0)
exifr (1.3.1)
ffi (1.9.18)
fspath (3.1.0)
highline (1.7.8)
html-pipeline (2.6.0)
activesupport (>= 2)
nokogiri (>= 1.4)
i18n (0.8.4)
image_optim (0.25.0)
exifr (~> 1.2, >= 1.2.2)
fspath (~> 3.0)
image_size (~> 1.5)
in_threads (~> 1.3)
progress (~> 3.0, >= 3.0.1)
image_optim_pack (0.5.0.20170712)
fspath (>= 2.1, < 4)
image_optim (~> 0.19)
image_size (1.5.0)
in_threads (1.4.0)
method_source (0.8.2)
mini_portile2 (2.2.0)
minitest (5.10.2)
@ -39,6 +52,7 @@ GEM
nokogiri (1.8.0)
mini_portile2 (~> 2.2.0)
options (2.3.2)
progress (3.3.1)
progress_bar (1.1.0)
highline (~> 1.6)
options (~> 2.3.0)
@ -109,6 +123,8 @@ DEPENDENCIES
coffee-script
erubi
html-pipeline
image_optim
image_optim_pack
minitest
nokogiri
progress_bar

View file

@ -20,6 +20,7 @@ module Docs
# Builds the requester with this class's defaults applied to +options+.
#
# The :request_options entry is removed from +options+ and a copy of it is
# kept on the instance (an empty Hash when none was given). The remaining
# options are forwarded to the superclass via the bare +super+ call.
#
# @param options [Hash] requester options; mutated in place
def initialize(options = {})
  extracted = options.extract!(:request_options)
  given_request_options = extracted[:request_options]
  @request_options = given_request_options.try(:dup) || {}

  # Only fill in a concurrency limit when the caller did not supply one.
  options[:max_concurrency] = 20 unless options[:max_concurrency]
  # Always overridden, regardless of what the caller passed.
  # NOTE(review): presumably disables HTTP pipelining in the parent requester — confirm.
  options[:pipelining] = 0

  super
end

View file

@ -0,0 +1,72 @@
# frozen_string_literal: true
module Docs
  # Pipeline filter that replaces the +src+ of every <img> in the document
  # with a base64 +data:+ URI holding the downloaded (and, unless disabled,
  # optimized) image.
  #
  # Context options read by this filter:
  # * +:optimize_images+ - set to +false+ to skip optimization.
  # * +:max_image_size+  - maximum size in bytes of an inlined image
  #   (defaults to 100 kilobytes).
  #
  # Events instrumented: +broken.image+, +invalid.image+, +too_big.image+
  # and +error.image+.
  class ImagesFilter < Filter
    include Instrumentable

    # Optimizes raw image bytes with a lazily created ImageOptim instance that
    # is memoized on the class.
    #
    # @param data [String] raw image bytes
    # @return the result of ImageOptim#optimize_image_data; #call falls back
    #   to the original bytes when this is falsy
    def self.optimize_image_data(data)
      @image_optim ||= ImageOptim.new
      @image_optim.optimize_image_data(data)
    end

    # Inlines every <img src> that can be fetched over http(s).
    #
    # Results are remembered in a class-level cache keyed by the original
    # +src+: a String entry is the finished data URI, +false+ means the image
    # was already attempted and should be left untouched.
    #
    # @return the (mutated) document
    def call
      @@cache ||= {}

      doc.css('img[src]').each do |node|
        src = node['src']

        if @@cache.key?(src)
          cached = @@cache[src]
          node['src'] = cached if cached
          next
        end

        # Mark as attempted up front so any failure below is not retried.
        @@cache[src] = false
        url = nil

        begin
          # Parsing happens inside the rescue so that a malformed src is
          # reported as an image error instead of aborting the whole filter.
          url = Docs::URL.parse(src)
          url.scheme = 'https' if url.scheme.nil?
          next unless url.scheme == 'http' || url.scheme == 'https'

          Request.run(url) do |response|
            unless response.success?
              instrument 'broken.image', url: url, status: response.code
              next
            end

            unless response.mime_type.start_with?('image/')
              instrument 'invalid.image', url: url, content_type: response.mime_type
              next
            end

            image = response.body

            unless context[:optimize_images] == false
              image = self.class.optimize_image_data(image) || image
            end

            # The size limit applies to the bytes that would be inlined,
            # i.e. after optimization and before base64 encoding.
            size = image.bytesize

            if size > max_size
              instrument 'too_big.image', url: url, size: size
              next
            end

            image = Base64.strict_encode64(image)
            image.prepend "data:#{response.mime_type};base64,"
            node['src'] = @@cache[src] = image
          end
        rescue => exception
          # url is still nil when parsing itself failed; report the raw src.
          instrument 'error.image', url: url || src, exception: exception
        end
      end

      doc
    end

    private

    # Maximum size, in bytes, of an image that may be inlined.
    def max_size
      @max_size ||= context[:max_image_size] || 100.kilobytes
    end
  end
end

View file

@ -0,0 +1,27 @@
# frozen_string_literal: true
module Docs
  # Logs the events instrumented under the +image+ namespace
  # (+broken+, +invalid+, +too_big+ and +error+).
  class ImageSubscriber < Subscriber
    self.namespace = 'image'

    # Logs an image whose request did not succeed.
    #
    # The HTTP status is published under the +:status+ payload key; +:code+
    # is only consulted as a fallback for emitters using that key.
    def broken(event)
      status = event.payload[:status] || event.payload[:code]
      log "Skipped broken image (#{status}): #{event.payload[:url]}"
    end

    # Logs a response whose content type is not an image.
    def invalid(event)
      log "Skipped invalid image (#{event.payload[:content_type]}): #{event.payload[:url]}"
    end

    # Logs an image that exceeded the size limit, with its size rounded to KB.
    def too_big(event)
      log "Skipped large image (#{(event.payload[:size] / 1.kilobyte.to_f).round} KB): #{event.payload[:url]}"
    end

    # Logs an exception raised while processing an image, followed by the
    # backtrace lines that start with Docs.root_path.
    def error(event)
      exception = event.payload[:exception]
      log "ERROR: #{event.payload[:url]}"
      puts " #{exception.class}: #{exception.message.gsub("\n", "\n ")}"
      puts exception.backtrace.select { |line| line.start_with?(Docs.root_path) }.join("\n ").prepend("\n ")
      puts "\n"
    end
  end
end

View file

@ -35,6 +35,7 @@ class DocsCLI < Thor
return puts 'ERROR: [path] must be an absolute path.'
end
Docs.install_report :image
Docs.install_report :store if options[:verbose]
if options[:debug]
GC.disable
@ -61,7 +62,7 @@ class DocsCLI < Thor
Docs.rescue_errors = true
Docs.install_report :store if options[:verbose]
Docs.install_report :scraper if options[:debug]
Docs.install_report :progress_bar, :doc if $stdout.tty?
Docs.install_report :progress_bar, :doc, :image if $stdout.tty?
require 'unix_utils' if options[:package]