Finish Drupal scraper

This commit is contained in:
Thibaut 2015-07-05 15:33:20 -04:00
parent dac9b9b03c
commit 79822f8ebc
14 changed files with 60 additions and 56 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 29 KiB

After

Width:  |  Height:  |  Size: 29 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 75 KiB

After

Width:  |  Height:  |  Size: 76 KiB

View file

@ -1,5 +1,8 @@
[
[
"2015-07-05",
"New documentation: <a href=\"/drupal/\">Drupal</a>"
], [
"2015-05-24",
"New <a href=\"/rust/\">Rust</a> documentation"
], [

View file

@ -135,6 +135,11 @@ credits = [
'Django Software Foundation and individual contributors',
'BSD',
'https://raw.githubusercontent.com/django/django/master/LICENSE'
], [
'Drupal',
'2001-2015 by the original authors<br>Drupal is a registered trademark of Dries Buytaert.',
'GPLv2',
'https://api.drupal.org/api/drupal/LICENSE.txt'
], [
'Ember.js',
'2015 Yehuda Katz, Tom Dale and Ember.js contributors',
@ -220,11 +225,6 @@ credits = [
'2004 John Gruber',
'BSD',
'http://daringfireball.net/projects/markdown/license'
], [
'MaxCDN',
'2015 MaxCDN',
'MIT',
'https://raw.githubusercontent.com/MaxCDN/api-docs/master/LICENSE'
], [
'Meteor',
'2011-2015 Meteor Development Group',

View file

@ -0,0 +1,6 @@
#= require views/pages/base
# Page view used for Drupal documentation pages; extends the shared BasePage view.
class app.views.DrupalPage extends app.views.BasePage
# After the page is rendered, pass every `pre.php` element returned by findAll
# to highlightCode with the 'php' language identifier.
afterRender: ->
@highlightCode @findAll('pre.php'), 'php'
# Bare return so the method does not implicitly return the highlightCode result.
return

View file

@ -37,6 +37,7 @@
'pages/clojure',
'pages/coffeescript',
'pages/d3',
'pages/drupal',
'pages/ember',
'pages/express',
'pages/git',

View file

@ -4,7 +4,7 @@
width: 1rem;
height: 1rem;
background-image: image-url('icons.png');
background-size: 10rem 8rem;
background-size: 10rem 9rem;
}
@media (-webkit-min-device-pixel-ratio: 1.5), (min-resolution: 144dpi) {
@ -101,3 +101,4 @@
._icon-meteor:before { background-position: -7rem -7rem; @extend %darkIconFix !optional; }
._icon-npm:before { background-position: -8rem -7rem; }
._icon-apache_http_server:before { background-position: -9rem -7rem; }
._icon-drupal:before { background-position: 0 -8rem; }

View file

@ -1,10 +1,4 @@
._drupal {
h1#page-subtitle {
margin-top: 0;
@extend %lined-heading;
}
h3 { @extend %block-heading; }
.signature { @extend %note, %note-blue; }
}
}

View file

@ -35,6 +35,11 @@ module Docs
def normalize_path(path)
path = path.downcase
if context[:decode_and_clean_paths]
path = URI.unescape(path)
path.gsub! %r{[!;:]+}, '-'
end
if path == '.'
'index'
elsif path.end_with? '/'

View file

@ -11,11 +11,23 @@ module Docs
end
# Cleans up a scraped api.drupal.org page: drops navigation/sidebar/comment
# elements, unwraps layout containers and converts the function signature
# table into a plain <pre class="signature"> element.
#
# NOTE(review): this excerpt shows both sides of a change without +/- markers.
# The two single-line `css(...).remove` calls and the multi-line one that
# follows overlap in their selectors, and `signature` is assigned twice in a
# row below (only the second value is used). Confirm which lines are current.
def other
css('#page-title-tools', '.element-invisible', '.breadcrumb', '#sidebar-first', '#api-alternatives').remove
css('#aside', '#api-function-signature tr:not(.active)', '.comments').remove
css('.element-invisible', '#sidebar-first', '#api-alternatives', '#aside', '.comments', '.view-filters',
'#api-function-signature tr:not(.active)', '.ctools-collapsible-container', 'img[width="13"]').remove
# Replace the #main wrapper with the .content element found in the document.
at_css('#main').replace(at_css('.content'))
# Replace the #page-heading wrapper with the #page-subtitle element.
at_css('#page-heading').replace(at_css('#page-subtitle'))
# Unwrap: move each matched node's children in front of it, then remove the
# now-empty node (table-header links and the .content container).
css('th.views-field > a', '.content').each do |node|
node.before(node.children).remove
end
# Re-assigning the text content replaces any nested markup inside <pre>
# with plain text.
css('pre').each do |node|
node.content = node.content
end
# Replaces the signature table from api.drupal.org with a simple pre tag
css('#api-function-signature').each do |table|
signature = table.css('.signature').first.inner_html
# Overrides the value above with the inner HTML of the <code> element
# inside the first .signature cell.
signature = table.css('.signature').first.at_css('code').inner_html
table.replace '<pre class="signature">' + signature + '</pre>'
end
end

View file

@ -1,20 +1,15 @@
module Docs
class Drupal
class EntriesFilter < Docs::EntriesFilter
# Returns the entry name: the text of the #page-subtitle element with PHP
# declaration keywords (abstract, public, static, protected, final, function,
# class) and the whitespace following them stripped.
#
# NOTE(review): this excerpt shows both sides of a change without +/- markers;
# `name` is assigned twice and only the second assignment affects the result.
# `remove!` is presumably ActiveSupport's String#remove! — confirm.
def get_name
name = css('#page-subtitle').first.content
name.remove! 'function '
# Supersedes the value computed above.
name = at_css('#page-subtitle').content
name.remove! %r{(abstract|public|static|protected|final|function|class)\s+}
name
end
# Entry path: the current result path passed through Drupal.fixUri.
def path
  raw_path = result[:path]
  Drupal.fixUri(raw_path)
end
# Returns the entry type: the stripped text of the second `.breadcrumb > a`
# link, cut at the first '.'.
#
# NOTE(review): this excerpt shows both sides of a change without +/- markers;
# the first two statements do not influence the return value. Also note that
# `css(...)[1]` is nil when fewer than two breadcrumb links match, which would
# raise NoMethodError on `.content` — confirm every scraped page has them.
def get_type
type = css('dl[api-related-topics] dt')
type.first ? type.first.content : nil
# Supersedes the value computed above.
type = css('.breadcrumb > a')[1].content.strip
type.split('.').first
end
def include_default_entry?

View file

@ -1,38 +1,26 @@
module Docs
class Drupal < UrlScraper
self.name = 'Drupal'
self.type = 'drupal'
self.version = '7.37'
self.version = '7.38'
self.base_url = 'https://api.drupal.org/api/drupal/'
self.initial_paths = %w(
groups
groups?page=1)
html_filters.replace 'normalize_paths', 'drupal/normalize_paths'
html_filters.replace 'internal_urls', 'drupal/internal_urls'
self.initial_paths = %w(groups groups?page=1)
self.links = {
home: 'https://www.drupal.org/'
}
html_filters.push 'drupal/entries', 'drupal/clean_html', 'title'
options[:container] = '#page'
options[:decode_and_clean_paths] = true
options[:container] = '#page-inner'
options[:title] = false
options[:root_title] = 'Drupal - Open Source CMS | Drupal.org'
options[:root_title] = 'Drupal'
options[:only_patterns] = [
/\/class\/[^\/]+/,
/\/group\/[^\/]+/,
/\/function\/[^\/]+/]
options[:skip_link] = ->(link) {
begin
return unless q = URL.parse(link['href']).query
Hash[URI.decode_www_form(q)].has_key? "order"
rescue URI::InvalidURIError
false
end
}
options[:skip] = %w(
'modules-system-system.install/group/updates-7.x-extra/7',
'modules-system-system.install/group/updates-6.x-to-7.x/7')
options[:skip_link] = ->(link) { link['href'] =~ /[\?&]order/ }
options[:skip_patterns] = [
/\/group\/updates\-7/,
@ -40,26 +28,25 @@ module Docs
/_update_[0-9]{4}/, # Skip update functions
/\/[4-6](\.[0-9])*$/, # Skip previous versions
/\/[8-9](\.[0-9])*$/, # Skip future versions
/\/class\/hierarchy\//, # Skip class hierarchy listings
/\/function\/calls\//, # Skip function calls listings
/\/function\/invokes\//, # Skip function invocations listings
/\/function\/overrides\//, # Skip function overrides listings
/\/function\/references\//, # Skip function references listings
/\/function\/implementations\//, # Skip hook implementation listings
/\/function\/theme_references\//, # Skip hook references listings
/\.test\/function\// # Skip test files
]
options[:fix_urls] = ->(url) do
url.sub! /\/7$/, '' # Remove the version indicator from the current version
url.remove! %r{/7$}
url
end
options[:attribution] = <<-HTML
&copy; 2000&ndash;2015 by the individual contributors.<br>
Licensed under the Creative Commons License, Attribution-ShareAlike2.0.<br>
&copy; 2001&ndash;2015 by the original authors<br>
Licensed under the GNU General Public License, version 2 and later.<br>
Drupal is a registered trademark of Dries Buytaert.
HTML
# Used in several places to normalize special characters in URLs from
# api.drupal.org: every literal "!" and every percent-encoded %21, %2b, %3b
# or %3a (i.e. "!", "+", ";", ":"), matched case-insensitively, becomes "-".
#
# path - the String path or URL fragment to clean.
#
# Returns a new String; the argument itself is not modified.
def self.fixUri(path)
  # Parenthesized call avoids Ruby's "ambiguous first argument" warning for a
  # bare regex literal, and no throwaway local shadows Kernel#p.
  path.gsub(/%21|!|%2b|%3b|%3a/i, '-')
end
end
end

Binary file not shown.

Before

Width:  |  Height:  |  Size: 704 B

After

Width:  |  Height:  |  Size: 547 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.7 KiB

After

Width:  |  Height:  |  Size: 1.2 KiB