mirror of
https://github.com/freeCodeCamp/devdocs
synced 2024-11-16 19:48:10 +01:00
Update and improve Sphinx scrapers
This commit is contained in:
parent
1412517869
commit
562463b112
20 changed files with 141 additions and 308 deletions
|
@ -36,6 +36,7 @@ app.views.RethinkdbPage =
|
|||
app.views.RubydocPage =
|
||||
app.views.SinonPage =
|
||||
app.views.SocketioPage =
|
||||
app.views.SphinxPage =
|
||||
app.views.SphinxSimplePage =
|
||||
app.views.TensorflowPage =
|
||||
app.views.TypescriptPage =
|
||||
|
|
|
@ -1,8 +0,0 @@
|
|||
#= require views/pages/base
|
||||
|
||||
class app.views.SphinxPage extends app.views.BasePage
  # Runs @highlightCode over the <pre> elements of each supported language.
  # The language name serves both as the CSS class looked up on the <pre>
  # and as the second argument handed to @highlightCode.
  prepare: ->
    for language in ['python', 'markup', 'php']
      @highlightCode @findAll("pre.#{language}"), language
    return
|
|
@ -2,15 +2,24 @@
|
|||
h2, h3 { @extend %block-heading; }
|
||||
h4 { font-size: 1em; }
|
||||
> dl:not(.docutils) > dt { @extend %block-label, %label-blue; }
|
||||
dl > dl > dt { @extend %block-label; }
|
||||
dd > dl:not(.docutils) > dt { @extend %block-label; }
|
||||
dt + dt { margin-top: -.5em; }
|
||||
|
||||
.note, .admonition, .versionadded, .versionchanged, .deprecated-removed { @extend %note; }
|
||||
.important { @extend %note-orange; }
|
||||
.warning, .deprecated-removed { @extend %note-red; }
|
||||
.versionmodified { font-weight: bold; }
|
||||
.note, .admonition, div.versionadded, div.versionchanged, .deprecated-removed, .deprecated { @extend %note; }
|
||||
|
||||
p > code, li > code, dd > code { @extend %label; }
|
||||
.important { @extend %note-orange; }
|
||||
.warning, .deprecated-removed, .deprecated { @extend %note-red; }
|
||||
|
||||
.versionmodified, span.title {
|
||||
display: block;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
p > code, li > code, dd > code, .docutils > dt > code { @extend %label; }
|
||||
|
||||
ul.simple { margin: 1em 0; }
|
||||
|
||||
h2 > a, h3 > a, dt[id] > a.external { float: right; }
|
||||
|
||||
.admonition-title {
|
||||
float: left;
|
||||
|
@ -20,15 +29,13 @@
|
|||
&:after { content: ':'; }
|
||||
}
|
||||
|
||||
.admonition > dl {
|
||||
.admonition > dl, .admonition > ul {
|
||||
clear: left;
|
||||
margin: 0;
|
||||
}
|
||||
.admonition-title + dl { padding-top: .5em; }
|
||||
|
||||
ul.simple { margin: 1em 0; }
|
||||
|
||||
h2 > a, h3 > a, dt[id] > a.external { float: right; }
|
||||
td > div { margin: 0 !important; }
|
||||
}
|
||||
|
||||
._sphinx {
|
||||
|
|
|
@ -4,14 +4,6 @@ module Docs
|
|||
def call
|
||||
@doc = at_css('#page-content')
|
||||
|
||||
css('blockquote > div > pre:first-child:last-child', 'blockquote > div > ul:first-child:last-child').each do |node|
|
||||
node.ancestors('blockquote').first.before(node).remove
|
||||
end
|
||||
|
||||
css('a > em').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
doc
|
||||
end
|
||||
end
|
||||
|
|
|
@ -2,8 +2,6 @@ module Docs
|
|||
class Cmake
|
||||
class CleanHtmlFilter < Filter
|
||||
def call
|
||||
css('.headerlink', '#contents .topic-title').remove
|
||||
|
||||
if root_page?
|
||||
css('#release-notes', '#index-and-search').remove
|
||||
|
||||
|
@ -12,38 +10,6 @@ module Docs
|
|||
end
|
||||
end
|
||||
|
||||
css('.contents > ul.simple > li:first-child:last-child').each do |node|
|
||||
node.parent.before(node.at_css('> ul'))
|
||||
node.remove
|
||||
end
|
||||
|
||||
css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
css('div[class*="highlight-"]').each do |node|
|
||||
pre = node.at_css('pre')
|
||||
pre.content = pre.content
|
||||
node.replace(pre)
|
||||
end
|
||||
|
||||
css('span[id]:empty').each do |node|
|
||||
node.next_element['id'] = node['id']
|
||||
node.remove
|
||||
end
|
||||
|
||||
css('.section').each do |node|
|
||||
if node['id']
|
||||
if node.first_element_child['id']
|
||||
node.element_children[1]['id'] = node['id']
|
||||
else
|
||||
node.first_element_child['id'] = node['id']
|
||||
end
|
||||
end
|
||||
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
doc
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,31 +0,0 @@
|
|||
module Docs
  class Codeigniter
    # Cleans up the HTML of a CodeIgniter documentation page: drops header
    # links, flattens headings and code blocks to plain text, strips legacy
    # table attributes and unwraps `.section` containers.
    class CleanHtmlFilter < Filter
      # Applies the clean-up steps in order and returns the document.
      def call
        css('.headerlink').remove

        # Re-assigning the text content discards any nested markup.
        css('h1', 'h2', 'h3', 'h4', 'h5', 'pre').each { |element| element.content = element.content }

        # Turn each highlight wrapper into a bare <pre>; CodeIgniter's own
        # "highlight-ci" class is relabelled as PHP.
        css('div[class^="highlight-"]').each do |wrapper|
          wrapper.content = wrapper.content.strip
          wrapper.name = 'pre'
          wrapper['class'] = 'php' if wrapper['class'].include?('highlight-ci')
        end

        css('table').each do |table|
          %w(border cellpadding).each { |attribute| table.remove_attribute attribute }
        end

        # Move the section's id onto its first child element, then replace
        # the section by its children.
        css('.section').each do |section|
          section_id = section['id']
          section.first_element_child['id'] = section_id if section_id
          section.before(section.children).remove
        end

        doc
      end
    end
  end
end
|
|
@ -2,7 +2,9 @@ module Docs
|
|||
class Codeigniter
|
||||
class EntriesFilter < Docs::EntriesFilter
|
||||
def get_name
|
||||
at_css('h1').content.strip
|
||||
name = at_css('h1').content.strip
|
||||
name.remove! "\u{00B6}"
|
||||
name
|
||||
end
|
||||
|
||||
def get_type
|
||||
|
|
|
@ -4,43 +4,6 @@ module Docs
|
|||
def call
|
||||
@doc = at_css('.yui-g')
|
||||
|
||||
css('.section', 'a > em').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
css('tt', 'span.pre').each do |node|
|
||||
node.name = 'code'
|
||||
node.content = node.content
|
||||
node.remove_attribute 'class'
|
||||
end
|
||||
|
||||
css('.headerlink').each do |node|
|
||||
id = node['href'][1..-1]
|
||||
node.parent['id'] ||= id
|
||||
doc.at_css("span##{id}").try(:remove)
|
||||
node.remove
|
||||
end
|
||||
|
||||
css('h1', 'h2', 'h3', 'dt').each do |node|
|
||||
links = node.css('a').remove
|
||||
node.content = node.content
|
||||
node << links
|
||||
end
|
||||
|
||||
css('div[class^="highlight-"]').each do |node|
|
||||
node.name = 'pre'
|
||||
node['class'] = node['data-language'] = case node['class']
|
||||
when 'highlight-python', 'highlight-default' then 'python'
|
||||
when 'highlight-html+django' then 'markup'
|
||||
else ''
|
||||
end
|
||||
node.content = node.at_css('pre').content
|
||||
end
|
||||
|
||||
css('code > code').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
doc
|
||||
end
|
||||
end
|
||||
|
|
|
@ -1,67 +0,0 @@
|
|||
module Docs
  class Matplotlib
    # Cleans up the HTML of a Matplotlib documentation page: removes header
    # links and rules, unwraps table-of-contents and section containers,
    # reduces highlighted code to plain <pre> blocks and normalizes headings,
    # definition terms, tables and inline code.
    class CleanHtmlFilter < Filter
      # Applies the clean-up steps in order and returns the document.
      def call
        css('.headerlink', 'hr').remove

        # Replace these wrappers by their children.
        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code').each do |node|
          node.before(node.children).remove
        end

        # Replace each highlight wrapper by its <pre>, flattened to plain text
        # and tagged with the language taken from the "highlight-<lang>" class.
        css('div[class*="highlight-"]').each do |node|
          pre = node.at_css('pre')
          next unless pre # wrapper without a <pre>: nothing to extract
          pre.content = pre.content
          pre['data-language'] = node['class'][/highlight\-(\w+)/, 1]
          node.replace(pre)
        end

        # Empty <span id> anchors hand their id to the following element.
        # A trailing anchor has no following element and is simply dropped.
        css('span[id]:empty').each do |node|
          target = node.next_element
          target['id'] = node['id'] if target
          node.remove
        end

        # Move each section's id onto its first child element (or the second
        # one when the first already has an id), then unwrap the section.
        css('.section').each do |node|
          first = node.first_element_child
          if node['id'] && first
            if first['id']
              second = node.element_children[1]
              second['id'] = node['id'] if second
            else
              first['id'] = node['id']
            end
          end

          node.before(node.children).remove
        end

        # Replace a heading link wrapping inline code by the code's text.
        # The text is assigned through #content= so that characters such as
        # "<" or "&" are kept as text instead of being re-parsed as markup.
        css('h2 > a > code').each do |node|
          link = node.parent
          next unless link # already detached by an earlier iteration
          link.content = node.content
          link.before(link.children).remove
        end

        # Wrap the text of identified definition terms in a single <code>.
        # The text is set via #content= (escaped) rather than interpolated
        # into an HTML string, which would mangle signatures containing "<".
        css('dt[id]').each do |node|
          text = node.content.strip
          node.inner_html = '<code></code>'
          node.first_element_child.content = text
        end

        css('li > p:first-child:last-child').each do |node|
          node.before(node.children).remove
        end

        css('table[border]').each do |node|
          node.remove_attribute 'border'
        end

        css('code[class]').each do |node|
          node.remove_attribute 'class'
        end

        # Flatten <h1> to plain text.
        css('h1').each do |node|
          node.content = node.content
        end

        css('p.rubric').each do |node|
          node.name = 'h4'
        end

        doc
      end
    end
  end
end
|
|
@ -4,60 +4,6 @@ module Docs
|
|||
def call
|
||||
@doc = at_css('#spc-section-body')
|
||||
|
||||
css('colgroup').remove
|
||||
|
||||
css('.section', 'a > em', 'dt > tt', 'dt > em', 'dt > big', 'tbody').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
css('.headerlink').each do |node|
|
||||
id = node['href'][1..-1]
|
||||
node.parent['id'] ||= id
|
||||
doc.at_css("span##{id}").try(:remove)
|
||||
node.remove
|
||||
end
|
||||
|
||||
css('tt', 'span.pre').each do |node|
|
||||
node.name = 'code'
|
||||
node.content = node.content
|
||||
node.remove_attribute 'class'
|
||||
end
|
||||
|
||||
css('h1', 'h2', 'h3').each do |node|
|
||||
node.content = node.content
|
||||
end
|
||||
|
||||
css('p.rubric').each do |node|
|
||||
node.name = 'h4'
|
||||
end
|
||||
|
||||
css('blockquote > div:first-child:last-child').each do |node|
|
||||
node.parent.before(node.parent.children).remove
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
css('.admonition-example').each do |node|
|
||||
title = node.at_css('.admonition-title')
|
||||
title.name = 'h4'
|
||||
title.remove_attribute 'class'
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
css('em.xref').each do |node|
|
||||
node.name = 'code'
|
||||
end
|
||||
|
||||
css('div[class*="highlight-"]').each do |node|
|
||||
node.content = node.content.strip
|
||||
node.name = 'pre'
|
||||
node['data-language'] = node['class'][/highlight\-(\w+)/, 1]
|
||||
node['class'] = node['data-language'] # tmp
|
||||
end
|
||||
|
||||
css('table[border]').each do |node|
|
||||
node.remove_attribute 'border'
|
||||
end
|
||||
|
||||
doc
|
||||
end
|
||||
end
|
||||
|
|
|
@ -4,21 +4,6 @@ module Docs
|
|||
def call
|
||||
@doc = at_css '.body'
|
||||
|
||||
css('> .section').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
# Clean inline code elements
|
||||
|
||||
css('tt.literal').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
css('tt', 'span.pre').each do |node|
|
||||
node.name = 'code'
|
||||
node.remove_attribute 'class'
|
||||
end
|
||||
|
||||
root_page? ? root : other
|
||||
|
||||
doc
|
||||
|
@ -26,14 +11,9 @@ module Docs
|
|||
|
||||
def root
|
||||
at_css('h1').content = 'Python'
|
||||
css('> p').remove
|
||||
end
|
||||
|
||||
def other
|
||||
css('.headerlink', 'hr').remove
|
||||
|
||||
# Clean headings
|
||||
|
||||
css('h1').each do |node|
|
||||
node.content = node.content.sub!(/\A[\d\.]+/) do |str|
|
||||
rgx = /\A#{str}/
|
||||
|
@ -43,32 +23,7 @@ module Docs
|
|||
end
|
||||
|
||||
css('h2', 'h3', 'h4').each do |node|
|
||||
node.css('a').each do |link|
|
||||
link.before(link.children).remove
|
||||
end
|
||||
node.child.content = node.child.content.remove @levelRegexp
|
||||
end
|
||||
|
||||
css('dt').each do |node|
|
||||
node.content = node.content
|
||||
end
|
||||
|
||||
# Remove blockquotes
|
||||
css('blockquote').each do |node|
|
||||
node.before(node.children).remove
|
||||
end
|
||||
|
||||
# Remove code highlighting
|
||||
css('[class*="highlight-python"]').each do |node|
|
||||
pre = node.at_css('pre')
|
||||
pre.content = pre.content
|
||||
pre['class'] = 'python'
|
||||
node.replace(pre)
|
||||
end
|
||||
|
||||
# Remove <table> border attribute
|
||||
css('table[border]').each do |node|
|
||||
node.remove_attribute 'border'
|
||||
node.inner_html = node.inner_html.remove @levelRegexp
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
97
lib/docs/filters/sphinx/clean_html.rb
Normal file
97
lib/docs/filters/sphinx/clean_html.rb
Normal file
|
@ -0,0 +1,97 @@
|
|||
module Docs
  class Sphinx
    # Shared HTML clean-up for Sphinx-generated documentation pages.
    #
    # Removes header links, rules and column groups, unwraps table-of-contents
    # and section containers, reduces highlighted code to plain <pre> blocks
    # carrying a `data-language` attribute, and normalizes definition terms,
    # headings, tables and inline code.
    class CleanHtmlFilter < Filter
      # Applies the clean-up steps in order and returns the document.
      def call
        css('.headerlink', 'hr', '#contents .topic-title', '#topics .topic-title', 'colgroup').remove

        # A contents list holding a single top-level item is replaced by that
        # item's nested list (when it has one).
        css('.contents > ul:first-child:last-child.simple > li:first-child:last-child').each do |node|
          nested = node.at_css('> ul')
          node.parent.before(nested) if nested
          node.remove
        end

        css('em.xref', 'tt').each do |node|
          node.name = 'code'
        end

        # Replace these wrappers by their children.
        css('.toc-backref', '.toctree-wrapper', '.contents', 'span.pre', 'pre a > code', 'tbody', 'code > code', 'a > em').each do |node|
          node.before(node.children).remove
        end

        # Replace each highlight wrapper by its <pre>, flattened to plain text
        # and tagged with a language derived from the "highlight-<lang>" class.
        css('div[class*="highlight-"]').each do |node|
          pre = node.at_css('pre')
          next unless pre # wrapper without a <pre>: nothing to extract

          pre.content = pre.content

          # "+" must be part of the capture, otherwise "html+django" is cut
          # down to "html" and the mapping to "markup" below never applies.
          language = node['class'][/highlight-([\w+]+)/, 1].to_s
          language = 'php' if language == 'ci'
          language = 'markup' if language == 'html+django'
          language = 'python' if language == 'default' || language.start_with?('python')
          pre['data-language'] = language

          node.replace(pre)
        end

        # Empty <span id> anchors hand their id to the next (or else the
        # previous) sibling element, unless that element already has an id.
        css('span[id]:empty').each do |node|
          target = node.next_element || node.previous_element
          target['id'] ||= node['id'] if target
          node.remove
        end

        # Move each section's id onto its first child element (or the second
        # one when the first already has an id), then unwrap the section.
        css('.section').each do |node|
          first = node.first_element_child
          if node['id'] && first
            if first['id']
              second = node.element_children[1]
              second['id'] = node['id'] if second
            else
              first['id'] = node['id']
            end
          end

          node.before(node.children).remove
        end

        # Replace a heading link wrapping inline code by the code's text.
        # The text is assigned through #content= so that characters such as
        # "<" or "&" are kept as text instead of being re-parsed as markup.
        css('h2 > a > code').each do |node|
          link = node.parent
          next unless link # already detached by an earlier iteration
          link.content = node.content
          link.before(link.children).remove
        end

        # Wrap the text of definition terms in a single <code>, keeping any
        # trailing links after it.
        css('dt').each do |node|
          next unless node['id'] || node.at_css('code')
          links = []
          links << node.children.last.remove while node.children.last.try(:name) == 'a'

          # Set the text via #content= (escaped) rather than interpolating it
          # into an HTML string, which would mangle signatures containing "<".
          text = node.content.strip
          node.inner_html = '<code></code> '
          node.first_element_child.content = text

          links.reverse_each { |link| node << link }
        end

        css('li > p:first-child:last-child').each do |node|
          node.before(node.children).remove
        end

        # Unwrap both the blockquote and its only child <div>.
        css('blockquote > div:first-child:last-child').each do |node|
          node.parent.before(node.parent.children).remove
          node.before(node.children).remove
        end

        # Turn the title of an example admonition into an <h4> (when present)
        # and unwrap the admonition.
        css('.admonition-example').each do |node|
          title = node.at_css('.admonition-title')
          if title
            title.name = 'h4'
            title.remove_attribute 'class'
          end
          node.before(node.children).remove
        end

        css('table[border]').each do |node|
          node.remove_attribute 'border'
          node.remove_attribute 'cellpadding'
          node.remove_attribute 'cellspacing'
        end

        css('code[class]').each do |node|
          node.remove_attribute 'class'
        end

        # Flatten <h1> to plain text.
        css('h1').each do |node|
          node.content = node.content
        end

        css('p.rubric').each do |node|
          node.name = 'h4'
        end

        doc
      end
    end
  end
end
|
|
@ -9,7 +9,7 @@ module Docs
|
|||
code: 'https://github.com/ansible/ansible'
|
||||
}
|
||||
|
||||
html_filters.push 'ansible/entries', 'ansible/clean_html', 'codeigniter/clean_html'
|
||||
html_filters.push 'ansible/entries', 'ansible/clean_html', 'sphinx/clean_html'
|
||||
|
||||
options[:skip] = %w(
|
||||
glossary.html
|
||||
|
|
|
@ -7,7 +7,7 @@ module Docs
|
|||
code: 'https://cmake.org/gitweb?p=cmake.git;a=summary'
|
||||
}
|
||||
|
||||
html_filters.push 'cmake/clean_html', 'cmake/entries', 'title'
|
||||
html_filters.push 'cmake/clean_html', 'sphinx/clean_html', 'cmake/entries', 'title'
|
||||
|
||||
options[:container] = '.body'
|
||||
options[:title] = false
|
||||
|
|
|
@ -9,7 +9,7 @@ module Docs
|
|||
code: 'https://github.com/bcit-ci/CodeIgniter'
|
||||
}
|
||||
|
||||
html_filters.push 'codeigniter/clean_html', 'codeigniter/entries'
|
||||
html_filters.push 'codeigniter/entries', 'sphinx/clean_html'
|
||||
|
||||
options[:container] = '.document'
|
||||
|
||||
|
@ -36,7 +36,7 @@ module Docs
|
|||
HTML
|
||||
|
||||
version '3.0' do
|
||||
self.release = '3.0.4'
|
||||
self.release = '3.0.6'
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
|
@ -8,7 +8,7 @@ module Docs
|
|||
code: 'https://github.com/django/django'
|
||||
}
|
||||
|
||||
html_filters.push 'django/entries', 'django/clean_html'
|
||||
html_filters.push 'django/entries', 'sphinx/clean_html', 'django/clean_html'
|
||||
text_filters.push 'django/fix_urls'
|
||||
|
||||
options[:container] = '#bd'
|
||||
|
|
|
@ -8,7 +8,7 @@ module Docs
|
|||
code: 'https://github.com/matplotlib/matplotlib'
|
||||
}
|
||||
|
||||
html_filters.push 'matplotlib/entries', 'matplotlib/clean_html'
|
||||
html_filters.push 'matplotlib/entries', 'sphinx/clean_html'
|
||||
|
||||
options[:container] = '.body'
|
||||
options[:skip] = %w(api_changes.html)
|
||||
|
|
|
@ -9,7 +9,7 @@ module Docs
|
|||
code: 'https://github.com/numpy/numpy'
|
||||
}
|
||||
|
||||
html_filters.push 'numpy/entries', 'numpy/clean_html'
|
||||
html_filters.push 'numpy/entries', 'numpy/clean_html', 'sphinx/clean_html'
|
||||
|
||||
# .main contains more than the page's content alone, but we need something
|
||||
# that includes the navigation bar as well in order to guess the type of
|
||||
|
@ -26,8 +26,13 @@ module Docs
|
|||
Licensed under the NumPy License.
|
||||
HTML
|
||||
|
||||
version '1.11' do
|
||||
self.release = '1.11.0'
|
||||
self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
|
||||
end
|
||||
|
||||
version '1.10' do
|
||||
self.release = '1.10.1'
|
||||
self.release = '1.10.4'
|
||||
self.base_url = "https://docs.scipy.org/doc/numpy-#{self.release}/reference/"
|
||||
end
|
||||
end
|
||||
|
|
|
@ -20,19 +20,19 @@ module Docs
|
|||
HTML
|
||||
|
||||
version '3.5' do
|
||||
self.release = '3.5.1'
|
||||
self.release = '3.5.2'
|
||||
self.dir = '/Users/Thibaut/DevDocs/Docs/Python35' # docs.python.org/3.5/download.html
|
||||
self.base_url = 'https://docs.python.org/3.5/'
|
||||
|
||||
html_filters.push 'python/entries_v3', 'python/clean_html'
|
||||
html_filters.push 'python/entries_v3', 'sphinx/clean_html', 'python/clean_html'
|
||||
end
|
||||
|
||||
version '2.7' do
|
||||
self.release = '2.7.10'
|
||||
self.release = '2.7.12'
|
||||
self.dir = '/Users/Thibaut/DevDocs/Docs/Python27' # docs.python.org/2.7/download.html
|
||||
self.base_url = 'https://docs.python.org/2.7/'
|
||||
|
||||
html_filters.push 'python/entries_v2', 'python/clean_html'
|
||||
html_filters.push 'python/entries_v2', 'sphinx/clean_html', 'python/clean_html'
|
||||
end
|
||||
end
|
||||
end
|
||||
|
|
5
lib/docs/scrapers/sphinx.rb
Normal file
5
lib/docs/scrapers/sphinx.rb
Normal file
|
@ -0,0 +1,5 @@
|
|||
# Defines the Sphinx scraper class inside the Docs namespace; its body only sets the abstract flag.
module Docs
|
||||
# NOTE(review): presumably a common parent for scrapers of Sphinx-generated docs — confirm against its subclasses.
class Sphinx < Scraper
|
||||
# Marks the class as abstract. NOTE(review): the effect of this flag is defined by Scraper, which is not visible here — confirm.
self.abstract = true
|
||||
end
|
||||
end
|
Loading…
Reference in a new issue