Finish Padrino scraper

This commit is contained in:
Thibaut Courouble 2016-06-05 18:37:37 -04:00
parent e1e4a626ff
commit 481233050d
15 changed files with 76 additions and 29 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 45 KiB

After

Width:  |  Height:  |  Size: 46 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 118 KiB

After

Width:  |  Height:  |  Size: 119 KiB

View file

@@ -1,7 +1,7 @@
[
[
"2016-06-05",
"New documentation: <a href=\"/kotlin/\">Kotlin</a>"
"New documentation: <a href=\"/kotlin/\">Kotlin</a> and <a href=\"/padrino/\">Padrino</a>"
], [
"2016-04-24",
"New documentations: <a href=\"/numpy/\">NumPy</a> and <a href=\"/apache_pig/\">Apache Pig</a>"

View file

@@ -350,6 +350,11 @@ credits = [
'2010-2016 The OpenTSDB Authors',
'LGPLv2.1',
'https://raw.githubusercontent.com/OpenTSDB/opentsdb.net/gh-pages/COPYING.LESSER'
], [
'Padrino',
'2010-2016 Padrino',
'MIT',
'https://raw.githubusercontent.com/padrino/padrino-framework/master/padrino/LICENSE.txt'
], [
'Perl',
'1993-2016 Larry Wall and others',

View file

@@ -29,6 +29,7 @@ app.views.PostgresPage =
app.views.RamdaPage =
app.views.ReactPage =
app.views.RethinkdbPage =
app.views.RubydocPage =
app.views.SinonPage =
app.views.SocketioPage =
app.views.SphinxSimplePage =

View file

@@ -73,6 +73,7 @@
'pages/requirejs',
'pages/rethinkdb',
'pages/rfc',
'pages/rubydoc',
'pages/rust',
'pages/socketio',
'pages/sphinx',

View file

@@ -73,6 +73,7 @@
'pages/requirejs',
'pages/rethinkdb',
'pages/rfc',
'pages/rubydoc',
'pages/rust',
'pages/socketio',
'pages/sphinx',

View file

@@ -140,3 +140,4 @@
._icon-apache_pig:before { background-position: -4rem -11rem; }
._icon-numpy:before { background-position: -5rem -11rem; }
._icon-kotlin:before { background-position: -6rem -11rem; }
._icon-padrino:before { background-position: -7rem -11rem; }

View file

@@ -0,0 +1,9 @@
// Rules scoped to elements inside the `._rubydoc` class.
// The block inherits the `%simple` placeholder and maps note/label
// elements onto the `%note`, `%label` and `%label-red` placeholders.
._rubydoc {
@extend %simple;
// Paragraph-level notes use the note placeholder.
p.note { @extend %note; }
// Inline notes use the label placeholder; `.private` ones use the red variant.
span.note { @extend %label; }
span.note.private { @extend %label-red; }
// Add top spacing to a list that directly follows an h4 heading.
h4 + ul { margin-top: 1em; }
}

View file

@@ -2,9 +2,37 @@ module Docs
class Padrino
class CleanHtmlFilter < Filter
def call
css('.summary_toggle').remove
css('.inheritanceTree').remove
at_css('#content')
css('.summary_toggle', '.inheritanceTree', 'h1 .note', '.source_code', '.box_info dl:last-child').remove
css('a[href*="travis"]', 'a[href*="gemnasium"]', 'a[href*="codeclimate"]', 'a[href*="gitter"]').remove if root_page?
css('.signature').each do |node|
node.name = 'h3'
end
css('.permalink', 'div.docstring', 'div.discussion', '.method_details_list', '.attr_details',
'h3 strong', 'h3 a', 'h3 tt', 'h3 span', 'div.inline p', 'div.inline').each do |node|
node.before(node.children).remove
end
css('.tag_title').each do |node|
node.name = 'h4'
end
css('span.summary_signature', 'tt', '.tags span.name').each do |node|
node.name = 'code'
node.inner_html = node.inner_html.strip
end
css('code > a').each do |node|
node.inner_html = node.inner_html.strip
end
css('pre.code').each do |node|
node.content = node.content
node['data-language'] = 'ruby'
end
doc
end
end
end

View file

@@ -2,34 +2,28 @@ module Docs
class Padrino
class EntriesFilter < Docs::EntriesFilter
def get_name
name = at_css('h1, h2').content
name.remove! 'Class: '
name.remove! 'Module: '
at_css('h1').content.split(' ').last
end
def get_type
type = name.dup
type.remove! %r{#.+\z}
type.split('::')[0..2].join('::')
name.split('::')[0..1].join('::')
end
def additional_entries
return [] if root_page?
require 'cgi'
return [] if initial_page?
css('.summary_signature').inject [] do |entries, node|
name = node.children[1].attributes['title'].value
name = CGI.unescape(name)
unless name.start_with?('_')
name.prepend self.name
entries << [name, self.name.gsub('::','/').downcase.strip + node.children[1].attributes['href'].value.slice(/\#.*/)] unless entries.any? { |entry| entry[0] == name }
end
entries
css('.signature').each_with_object [] do |node, entries|
next if node.ancestors('.overload').present?
name = node.content.strip
name.remove! %r{[\s\(].*}
name.prepend(self.name)
entries << [name, node['id']]
end
end
# Returns false when `initial_page?` is true, and true otherwise.
# NOTE(review): presumably this keeps the stubbed index page from
# producing its own default entry — confirm against Docs::EntriesFilter.
def include_default_entry?
!initial_page?
end
end
end
end

View file

@@ -1,11 +1,11 @@
module Docs
class Padrino < UrlScraper
self.name = 'padrino'
self.slug = 'padrino'
self.type = 'ruby'
self.version = 'master'
self.type = 'rubydoc'
self.release = '0.13.2'
self.base_url = 'http://www.rubydoc.info/github/padrino/padrino-framework'
self.base_url = 'http://www.rubydoc.info/github/padrino/padrino-framework/'
self.root_path = 'file/README.rdoc'
self.initial_paths = %w(index2)
self.links = {
home: 'http://padrinorb.com/',
code: 'https://github.com/padrino/padrino-framework'
@@ -13,9 +13,15 @@ module Docs
html_filters.push 'padrino/clean_html', 'padrino/entries'
options[:container] = ->(filter) { filter.root_page? ? '#filecontents' : '#content' }
options[:attribution] = <<-HTML
&copy; Padrino contributors<br>
Licensed under the Creative Commons Attribution License.
&copy; 2010&ndash;2016 Padrino<br>
Licensed under the MIT License.
HTML
stub 'index2' do
request_one(url_for('index')).body
end
end
end

Binary file not shown.

After

Width:  |  Height:  |  Size: 414 B

Binary file not shown.

After

Width:  |  Height:  |  Size: 861 B

View file

@@ -0,0 +1 @@
https://raw.githubusercontent.com/padrino/padrino-web/master/source/images/favicon.ico