Update and improve Sphinx scrapers

This commit is contained in:
Thibaut Courouble 2016-07-03 18:45:29 -04:00
parent 1412517869
commit 562463b112
20 changed files with 141 additions and 308 deletions

View file

@ -36,6 +36,7 @@ app.views.RethinkdbPage =
app.views.RubydocPage =
app.views.SinonPage =
app.views.SocketioPage =
app.views.SphinxPage =
app.views.SphinxSimplePage =
app.views.TensorflowPage =
app.views.TypescriptPage =

View file

@ -1,8 +0,0 @@
#= require views/pages/base
# Page view for Sphinx docs: runs the syntax highlighter over tagged <pre> blocks.
class app.views.SphinxPage extends app.views.BasePage
  # Highlights every `pre.<language>` element with the matching language name,
  # in the order python, markup, php.
  prepare: ->
    for language in ['python', 'markup', 'php']
      @highlightCode @findAll("pre.#{language}"), language
    return

View file

@ -2,15 +2,24 @@
h2, h3 { @extend %block-heading; }
h4 { font-size: 1em; }
> dl:not(.docutils) > dt { @extend %block-label, %label-blue; }
dl > dl > dt { @extend %block-label; }
dd > dl:not(.docutils) > dt { @extend %block-label; }
dt + dt { margin-top: -.5em; }
.note, .admonition, .versionadded, .versionchanged, .deprecated-removed { @extend %note; }
.important { @extend %note-orange; }
.warning, .deprecated-removed { @extend %note-red; }
.versionmodified { font-weight: bold; }
.note, .admonition, div.versionadded, div.versionchanged, .deprecated-removed, .deprecated { @extend %note; }
p > code, li > code, dd > code { @extend %label; }
.important { @extend %note-orange; }
.warning, .deprecated-removed, .deprecated { @extend %note-red; }
.versionmodified, span.title {
display: block;
font-weight: bold;
}
p > code, li > code, dd > code, .docutils > dt > code { @extend %label; }
ul.simple { margin: 1em 0; }
h2 > a, h3 > a, dt[id] > a.external { float: right; }
.admonition-title {
float: left;
@ -20,15 +29,13 @@
&:after { content: ':'; }
}
.admonition > dl {
.admonition > dl, .admonition > ul {
clear: left;
margin: 0;
}
.admonition-title + dl { padding-top: .5em; }
ul.simple { margin: 1em 0; }
h2 > a, h3 > a, dt[id] > a.external { float: right; }
td > div { margin: 0 !important; }
}
._sphinx {

View file

@ -4,14 +4,6 @@ module Docs
# Restricts the document to `#page-content`, then simplifies its markup:
# a <pre> or <ul> that is the only child of a `blockquote > div` is moved in
# front of its enclosing blockquote, and <em> elements directly inside links
# are replaced by their children. Returns the resulting document.
def call
  @doc = at_css('#page-content')

  only_children = css('blockquote > div > pre:first-child:last-child',
                      'blockquote > div > ul:first-child:last-child')
  only_children.each do |child|
    enclosing_quote = child.ancestors('blockquote').first
    # Nokogiri's `before` returns its receiver, so the chained `remove`
    # drops the (now redundant) blockquote.
    enclosing_quote.before(child).remove
  end

  css('a > em').each { |emphasis| emphasis.before(emphasis.children).remove }

  doc
end
end

View file

@ -2,8 +2,6 @@ module Docs
class Cmake
class CleanHtmlFilter < Filter
def call
css('.headerlink', '#contents .topic-title').remove
if root_page?
css('#release-notes', '#index-and-search').remove
@ -12,38 +10,6 @@ module Docs
end
end
css('.contents > ul.simple > li:first-child:last-child').each do |node|
node.parent.before(node.at_css('> ul'))
node.remove
end
css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code').each do |node|
node.before(node.children).remove
end
css('div[class*="highlight-"]').each do |node|
pre = node.at_css('pre')
pre.content = pre.content
node.replace(pre)
end
css('span[id]:empty').each do |node|
node.next_element['id'] = node['id']
node.remove
end
css('.section').each do |node|
if node['id']
if node.first_element_child['id']
node.element_children[1]['id'] = node['id']
else
node.first_element_child['id'] = node['id']
end
end
node.before(node.children).remove
end
doc
end
end

View file

@ -1,31 +0,0 @@
module Docs
  class Codeigniter
    # Cleans up CodeIgniter documentation markup: removes header links,
    # flattens headings and code samples to plain text, strips presentational
    # table attributes and unwraps `.section` containers.
    class CleanHtmlFilter < Filter
      # Applies the clean-up steps in order and returns the document.
      def call
        css('.headerlink').remove

        # Assigning an element's own text back to it replaces nested markup
        # with plain text.
        css(*%w(h1 h2 h3 h4 h5 pre)).each { |element| element.content = element.content }

        # Turn each highlight wrapper into a bare <pre> holding only its text.
        css('div[class^="highlight-"]').each do |wrapper|
          trimmed_text = wrapper.content.strip
          wrapper.content = trimmed_text
          wrapper.name = 'pre'
          if wrapper['class'].include?('highlight-ci')
            wrapper['class'] = 'php'
          end
        end

        css('table').each do |table|
          %w(border cellpadding).each { |attribute| table.remove_attribute(attribute) }
        end

        # Hand the section's id to its first child before unwrapping it.
        css('.section').each do |section|
          section_id = section['id']
          section.first_element_child['id'] = section_id if section_id
          section.before(section.children).remove
        end

        doc
      end
    end
  end
end

View file

@ -2,7 +2,9 @@ module Docs
class Codeigniter
class EntriesFilter < Docs::EntriesFilter
def get_name
at_css('h1').content.strip
name = at_css('h1').content.strip
name.remove! "\u{00B6}"
name
end
def get_type

View file

@ -4,43 +4,6 @@ module Docs
# Restricts the document to `.yui-g` and normalises its markup: unwraps
# sections and link emphasis, converts inline literals to plain <code>,
# moves header-link anchors onto their parents as ids, flattens headings and
# turns highlight wrappers into language-tagged <pre> elements.
# Returns the resulting document.
def call
  @doc = at_css('.yui-g')

  css('.section', 'a > em').each { |wrapper| wrapper.before(wrapper.children).remove }

  css('tt', 'span.pre').each do |literal|
    literal.name = 'code'
    literal.content = literal.content
    literal.remove_attribute 'class'
  end

  css('.headerlink').each do |anchor|
    # Everything after the first character of the href is used as the id
    # (presumably the href is a "#fragment" — confirm against the input).
    id = anchor['href'][1..-1]
    anchor.parent['id'] ||= id
    placeholder = doc.at_css("span##{id}")
    placeholder.remove if placeholder
    anchor.remove
  end

  # Flatten the heading to plain text, then re-append the links it contained.
  css('h1', 'h2', 'h3', 'dt').each do |heading|
    detached_links = heading.css('a').remove
    heading.content = heading.content
    heading << detached_links
  end

  languages = {
    'highlight-python'      => 'python',
    'highlight-default'     => 'python',
    'highlight-html+django' => 'markup'
  }

  css('div[class^="highlight-"]').each do |wrapper|
    wrapper.name = 'pre'
    language = languages.fetch(wrapper['class'], '')
    wrapper['data-language'] = language
    wrapper['class'] = language
    wrapper.content = wrapper.at_css('pre').content
  end

  css('code > code').each { |inner| inner.before(inner.children).remove }

  doc
end
end

View file

@ -1,67 +0,0 @@
module Docs
  class Matplotlib
    # Cleans up Matplotlib documentation markup: removes header links and
    # rules, unwraps table-of-contents and section containers, reduces
    # highlighted code to language-tagged <pre> elements and simplifies
    # definition terms, list items, tables and headings.
    class CleanHtmlFilter < Filter
      # Applies the clean-up steps in order and returns the document.
      def call
        css('.headerlink', 'hr').remove

        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code').each do |wrapper|
          wrapper.before(wrapper.children).remove
        end

        # Replace each highlight wrapper with its <pre>, flattened to text and
        # tagged with the language taken from the wrapper's class.
        css('div[class*="highlight-"]').each do |wrapper|
          code_block = wrapper.at_css('pre')
          code_block.content = code_block.content
          code_block['data-language'] = wrapper['class'][/highlight\-(\w+)/, 1]
          wrapper.replace(code_block)
        end

        # Empty id-carrying spans pass their id to the following element.
        css('span[id]:empty').each do |marker|
          marker.next_element['id'] = marker['id']
          marker.remove
        end

        # A section's id goes to its first child, or to the second one when
        # the first already has an id; the section is then unwrapped.
        css('.section').each do |section|
          if section['id']
            first_child = section.first_element_child
            recipient = first_child['id'] ? section.element_children[1] : first_child
            recipient['id'] = section['id']
          end
          section.before(section.children).remove
        end

        css('h2 > a > code').each do |code|
          link = code.parent
          link.before(code.content).remove
        end

        css('dt[id]').each do |term|
          signature = term.content.strip
          term.inner_html = "<code>#{signature}</code>"
        end

        css('li > p:first-child:last-child').each do |paragraph|
          paragraph.before(paragraph.children).remove
        end

        css('table[border]').each { |table| table.remove_attribute 'border' }

        css('code[class]').each { |code| code.remove_attribute 'class' }

        css('h1').each { |title| title.content = title.content }

        css('p.rubric').each { |rubric| rubric.name = 'h4' }

        doc
      end
    end
  end
end

View file

@ -4,60 +4,6 @@ module Docs
# Restricts the document to `#spc-section-body` and normalises its markup:
# drops colgroups, unwraps structural wrappers, moves header-link anchors onto
# their parents as ids, converts inline literals to <code>, flattens headings,
# unwraps single-div blockquotes and example admonitions, and turns highlight
# wrappers into language-tagged <pre> elements. Returns the resulting document.
def call
  @doc = at_css('#spc-section-body')

  css('colgroup').remove

  css('.section', 'a > em', 'dt > tt', 'dt > em', 'dt > big', 'tbody').each do |wrapper|
    wrapper.before(wrapper.children).remove
  end

  css('.headerlink').each do |anchor|
    # Everything after the first character of the href is used as the id
    # (presumably the href is a "#fragment" — confirm against the input).
    id = anchor['href'][1..-1]
    anchor.parent['id'] ||= id
    placeholder = doc.at_css("span##{id}")
    placeholder.remove if placeholder
    anchor.remove
  end

  css('tt', 'span.pre').each do |literal|
    literal.name = 'code'
    literal.content = literal.content
    literal.remove_attribute 'class'
  end

  css('h1', 'h2', 'h3').each { |heading| heading.content = heading.content }

  css('p.rubric').each { |rubric| rubric.name = 'h4' }

  # Unwrap the blockquote first, then the div that was its only child.
  css('blockquote > div:first-child:last-child').each do |inner|
    quote = inner.parent
    quote.before(quote.children).remove
    inner.before(inner.children).remove
  end

  css('.admonition-example').each do |example|
    heading = example.at_css('.admonition-title')
    heading.name = 'h4'
    heading.remove_attribute 'class'
    example.before(example.children).remove
  end

  css('em.xref').each { |reference| reference.name = 'code' }

  css('div[class*="highlight-"]').each do |wrapper|
    wrapper.content = wrapper.content.strip
    wrapper.name = 'pre'
    wrapper['data-language'] = wrapper['class'][/highlight\-(\w+)/, 1]
    wrapper['class'] = wrapper['data-language'] # tmp
  end

  css('table[border]').each { |table| table.remove_attribute 'border' }

  doc
end
end

View file

@ -4,21 +4,6 @@ module Docs
def call
@doc = at_css '.body'
css('> .section').each do |node|
node.before(node.children).remove
end
# Clean inline code elements
css('tt.literal').each do |node|
node.before(node.children).remove
end
css('tt', 'span.pre').each do |node|
node.name = 'code'
node.remove_attribute 'class'
end
root_page? ? root : other
doc
@ -26,14 +11,9 @@ module Docs
def root
at_css('h1').content = 'Python'
css('> p').remove
end
def other
css('.headerlink', 'hr').remove
# Clean headings
css('h1').each do |node|
node.content = node.content.sub!(/\A[\d\.]+/) do |str|
rgx = /\A#{str}/
@ -43,32 +23,7 @@ module Docs
end
css('h2', 'h3', 'h4').each do |node|
node.css('a').each do |link|
link.before(link.children).remove
end
node.child.content = node.child.content.remove @levelRegexp
end
css('dt').each do |node|
node.content = node.content
end
# Remove blockquotes
css('blockquote').each do |node|
node.before(node.children).remove
end
# Remove code highlighting
css('[class*="highlight-python"]').each do |node|
pre = node.at_css('pre')
pre.content = pre.content
pre['class'] = 'python'
node.replace(pre)
end
# Remove <table> border attribute
css('table[border]').each do |node|
node.remove_attribute 'border'
node.inner_html = node.inner_html.remove @levelRegexp
end
end
end

View file

@ -0,0 +1,97 @@
require 'cgi'

module Docs
  class Sphinx
    # Shared HTML clean-up for the scrapers that push 'sphinx/clean_html'.
    #
    # Removes header links and other decoration, unwraps table-of-contents and
    # section containers, reduces highlighted code to language-tagged <pre>
    # elements, and simplifies definition terms, lists, blockquotes, tables and
    # headings. Steps run in order because later selectors depend on the
    # mutations made by earlier ones.
    class CleanHtmlFilter < Filter
      # Applies the clean-up steps and returns the document.
      def call
        css('.headerlink', 'hr', '#contents .topic-title', '#topics .topic-title', 'colgroup').remove

        # A contents list with a single top-level item: keep only its nested list.
        css('.contents > ul:first-child:last-child.simple > li:first-child:last-child').each do |node|
          nested_list = node.at_css('> ul')
          node.parent.before(nested_list) if nested_list
          node.remove
        end

        css('em.xref', 'tt').each do |node|
          node.name = 'code'
        end

        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code', 'tbody', 'code > code', 'a > em').each do |node|
          node.before(node.children).remove
        end

        css('div[class*="highlight-"]').each do |node|
          pre = node.at_css('pre')
          next unless pre # no code block inside: leave the wrapper untouched

          pre.content = pre.content
          css_class = node['class']
          lang = css_class[/highlight\-(\w+)/, 1].to_s
          lang = 'php' if lang == 'ci'
          # `\w+` stops at "+", so "highlight-html+django" is captured as
          # "html"; check the class itself to map it to markup.
          lang = 'markup' if css_class.include?('highlight-html+django')
          lang = 'python' if lang == 'default' || lang.start_with?('python')
          pre['data-language'] = lang
          node.replace(pre)
        end

        # Empty id-carrying spans pass their id to a neighbouring element
        # (next one preferred) unless that element already has an id.
        css('span[id]:empty').each do |node|
          neighbour = node.next_element || node.previous_element
          neighbour['id'] ||= node['id'] if neighbour
          node.remove
        end

        # A section's id goes to its first child, or to the second one when
        # the first already has an id; the section is then unwrapped.
        css('.section').each do |node|
          first_child = node.first_element_child
          if node['id'] && first_child
            if first_child['id']
              second_child = node.element_children[1]
              second_child['id'] = node['id'] if second_child
            else
              first_child['id'] = node['id']
            end
          end
          node.before(node.children).remove
        end

        css('h2 > a > code').each do |node|
          node.parent.before(node.content).remove
        end

        css('dt').each do |node|
          next unless node['id'] || node.at_css('code')
          # Detach trailing links so they survive the flattening below.
          links = []
          links << node.children.last.remove while node.children.last.try(:name) == 'a'
          # `content` is decoded text; escape it so characters such as "<" and
          # "&" are not re-parsed as markup when assigned to inner_html.
          node.inner_html = "<code>#{CGI.escapeHTML(node.content.strip)}</code> "
          links.reverse_each { |link| node << link }
        end

        css('li > p:first-child:last-child').each do |node|
          node.before(node.children).remove
        end

        # Unwrap the blockquote first, then the div that was its only child.
        css('blockquote > div:first-child:last-child').each do |node|
          node.parent.before(node.parent.children).remove
          node.before(node.children).remove
        end

        css('.admonition-example').each do |node|
          title = node.at_css('.admonition-title')
          if title
            title.name = 'h4'
            title.remove_attribute 'class'
          end
          node.before(node.children).remove
        end

        css('table[border]').each do |node|
          node.remove_attribute 'border'
          node.remove_attribute 'cellpadding'
          node.remove_attribute 'cellspacing'
        end

        css('code[class]').each do |node|
          node.remove_attribute 'class'
        end

        css('h1').each do |node|
          node.content = node.content
        end

        css('p.rubric').each do |node|
          node.name = 'h4'
        end

        doc
      end
    end
  end
end

View file

@ -9,7 +9,7 @@ module Docs
code: 'https://github.com/ansible/ansible'
}
html_filters.push 'ansible/entries', 'ansible/clean_html', 'codeigniter/clean_html'
html_filters.push 'ansible/entries', 'ansible/clean_html', 'sphinx/clean_html'
options[:skip] = %w(
glossary.html

View file

@ -7,7 +7,7 @@ module Docs
code: 'https://cmake.org/gitweb?p=cmake.git;a=summary'
}
html_filters.push 'cmake/clean_html', 'cmake/entries', 'title'
html_filters.push 'cmake/clean_html', 'sphinx/clean_html', 'cmake/entries', 'title'
options[:container] = '.body'
options[:title] = false

View file

@ -9,7 +9,7 @@ module Docs
code: 'https://github.com/bcit-ci/CodeIgniter'
}
html_filters.push 'codeigniter/clean_html', 'codeigniter/entries'
html_filters.push 'codeigniter/entries', 'sphinx/clean_html'
options[:container] = '.document'
@ -36,7 +36,7 @@ module Docs
HTML
version '3.0' do
self.release = '3.0.4'
self.release = '3.0.6'
end
end
end

View file

@ -8,7 +8,7 @@ module Docs
code: 'https://github.com/django/django'
}
html_filters.push 'django/entries', 'django/clean_html'
html_filters.push 'django/entries', 'sphinx/clean_html', 'django/clean_html'
text_filters.push 'django/fix_urls'
options[:container] = '#bd'

View file

@ -8,7 +8,7 @@ module Docs
code: 'https://github.com/matplotlib/matplotlib'
}
html_filters.push 'matplotlib/entries', 'matplotlib/clean_html'
html_filters.push 'matplotlib/entries', 'sphinx/clean_html'
options[:container] = '.body'
options[:skip] = %w(api_changes.html)

View file

@ -9,7 +9,7 @@ module Docs
code: 'https://github.com/numpy/numpy'
}
html_filters.push 'numpy/entries', 'numpy/clean_html'
html_filters.push 'numpy/entries', 'numpy/clean_html', 'sphinx/clean_html'
# .main contains more than the page's content alone, but we need something
# that includes the navigation bar as well in order to guess the type of
@ -26,8 +26,13 @@ module Docs
Licensed under the NumPy License.
HTML
version '1.11' do
self.release = '1.11.0'
self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
end
version '1.10' do
self.release = '1.10.1'
self.release = '1.10.4'
self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
end
end

View file

@ -20,19 +20,19 @@ module Docs
HTML
version '3.5' do
self.release = '3.5.1'
self.release = '3.5.2'
self.dir = '/Users/Thibaut/DevDocs/Docs/Python35' # docs.python.org/3.5/download.html
self.base_url = 'https://docs.python.org/3.5/'
html_filters.push 'python/entries_v3', 'python/clean_html'
html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
end
version '2.7' do
self.release = '2.7.10'
self.release = '2.7.12'
self.dir = '/Users/Thibaut/DevDocs/Docs/Python27' # docs.python.org/2.7/download.html
self.base_url = 'https://docs.python.org/2.7/'
html_filters.push 'python/entries_v2', 'python/clean_html'
html_filters.push 'python/entries_v2', 'sphinx/clean_html', 'python/clean_html'
end
end
end

View file

@ -0,0 +1,5 @@
module Docs
# Scraper subclass for Sphinx; it declares no options or filters of its own
# and only sets the `abstract` flag.
# NOTE(review): presumably `abstract` marks this as a base class that is not
# itself a scrapable documentation — confirm against Scraper.
class Sphinx < Scraper
self.abstract = true
end
end