mirror of
https://github.com/freeCodeCamp/devdocs
synced 2024-11-16 19:48:10 +01:00
basic scraping working
This commit is contained in:
parent
94803a4fa9
commit
cda737ceec
5 changed files with 136 additions and 1 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
@ -6,3 +6,4 @@ public/fonts
|
|||
public/docs/**/*
|
||||
docs/**/*
|
||||
!docs/*.md
|
||||
vendor
|
||||
|
|
21
lib/docs/filters/trio/clean_html.rb
Normal file
21
lib/docs/filters/trio/clean_html.rb
Normal file
|
@ -0,0 +1,21 @@
|
|||
module Docs
  class Trio
    # HTML filter that narrows a scraped Trio page down to its main content
    # container and simplifies the markup inside it.
    class CleanHtmlFilter < Filter
      # Runs the clean-up steps in order:
      #
      # 1. make `div[role="main"]` the working document,
      # 2. unwrap `.section` and `[itemprop=articleBody]` containers, keeping
      #    their children in place,
      # 3. remove every `.headerlink` element,
      # 4. replace every `dt` with an `h3` holding the `dt`'s text.
      #
      # @return the cleaned working document (`doc`)
      def call
        @doc = at_css('div[role="main"]')

        # Replace each wrapper by its own children so the content stays but
        # the wrapper element disappears.
        css('.section, [itemprop=articleBody]').each do |wrapper|
          wrapper.replace wrapper.children
        end

        css('.headerlink').remove

        css('dt').each do |term|
          heading = doc.document.create_element 'h3'

          # NOTE(review): the last character of the text is dropped here even
          # though `.headerlink` nodes were already removed above — confirm
          # which trailing character this is meant to strip.
          heading.content = term.inner_text[0...-1]

          # Keep the anchor: the entries filter of this scraper runs earlier in
          # the pipeline and records the `id` of the parent of each `.descname`
          # node (presumably these `dt` elements) as the entry fragment.
          # Without the id on the replacement heading those fragments would
          # point at nothing.
          heading['id'] = term['id'] if term['id']

          term.replace heading
        end

        doc
      end
    end
  end
end
|
21
lib/docs/filters/trio/entries.rb
Normal file
21
lib/docs/filters/trio/entries.rb
Normal file
|
@ -0,0 +1,21 @@
|
|||
module Docs
  class Trio
    # Entries filter for Trio pages: derives the page name/type from the first
    # `h1` and emits one additional entry per `.descname` node.
    class EntriesFilter < Docs::EntriesFilter
      # @return [String] text of the first `h1` without its final character
      def get_name
        # NOTE(review): the dropped character is presumably the permalink
        # marker appended to the heading — confirm against the scraped HTML.
        at_css('h1').text[0...-1]
      end

      # The type is the same string as the page name, so every entry of a page
      # is grouped under that page's title.
      #
      # @return [String]
      def get_type
        get_name
      end

      # Builds one `[name, id]` pair per `.descname` node. The name is the text
      # of the node's previous sibling (if any) followed by the node's own
      # text; the id is the `id` attribute of the node's parent.
      #
      # @return [Array<Array>] list of `[name, id]` pairs
      def additional_entries
        css('.descname').map do |node|
          # `previous` is nil when the node is the first child of its parent;
          # fall back to an empty prefix instead of raising NoMethodError.
          sibling = node.previous
          prefix = sibling ? sibling.text : ''

          [prefix + node.text, node.parent['id']]
        end
      end
    end
  end
end
|
24
lib/docs/scrapers/trio.rb
Normal file
24
lib/docs/scrapers/trio.rb
Normal file
|
@ -0,0 +1,24 @@
|
|||
module Docs
  # Scraper configuration for the Trio documentation.
  class Trio < UrlScraper
    self.type = 'simple'
    self.release = '0.11'
    # NOTE(review): the URL targets the "latest" build while `release` is
    # pinned to 0.11 — confirm the two are meant to stay in sync.
    self.base_url = 'https://trio.readthedocs.io/en/latest/'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://trio.readthedocs.io/',
      code: 'https://github.com/python-trio/trio'
    }

    # Order matters: entries are extracted before the HTML is cleaned.
    html_filters.push('trio/entries', 'trio/clean_html')

    # TODO: the attribution text is still empty.
    options[:attribution] = <<-HTML
    HTML

    # Only pages whose path matches one of these patterns are kept.
    options[:only_patterns] = [
      /reference-core/,
      /reference-io/,
      /reference-testing/,
      /reference-hazmat/
    ]
  end
end
|
|
@ -1 +1,69 @@
|
|||
[]
|
||||
[
|
||||
{
|
||||
"name": "Chef",
|
||||
"slug": "chef~12",
|
||||
"type": "sphinx_simple",
|
||||
"links": {
|
||||
"home": "https://www.chef.io/",
|
||||
"code": "https://github.com/chef/chef"
|
||||
},
|
||||
"version": "12",
|
||||
"release": "12.13",
|
||||
"mtime": 1556264506,
|
||||
"db_size": 7170006
|
||||
},
|
||||
{
|
||||
"name": "CSS",
|
||||
"slug": "css",
|
||||
"type": "mdn",
|
||||
"mtime": 1543099045,
|
||||
"db_size": 12415944
|
||||
},
|
||||
{
|
||||
"name": "DOM",
|
||||
"slug": "dom",
|
||||
"type": "mdn",
|
||||
"mtime": 1543157862,
|
||||
"db_size": 33998524
|
||||
},
|
||||
{
|
||||
"name": "DOM Events",
|
||||
"slug": "dom_events",
|
||||
"type": "mdn",
|
||||
"mtime": 1543099589,
|
||||
"db_size": 1752500
|
||||
},
|
||||
{
|
||||
"name": "HTML",
|
||||
"slug": "html",
|
||||
"type": "mdn",
|
||||
"mtime": 1543097764,
|
||||
"db_size": 4141596
|
||||
},
|
||||
{
|
||||
"name": "HTTP",
|
||||
"slug": "http",
|
||||
"type": "mdn",
|
||||
"mtime": 1543099392,
|
||||
"db_size": 4731727
|
||||
},
|
||||
{
|
||||
"name": "JavaScript",
|
||||
"slug": "javascript",
|
||||
"type": "mdn",
|
||||
"mtime": 1543098529,
|
||||
"db_size": 6462141
|
||||
},
|
||||
{
|
||||
"name": "Trio",
|
||||
"slug": "trio",
|
||||
"type": "simple",
|
||||
"links": {
|
||||
"home": "https://trio.readthedocs.io/",
|
||||
"code": "https://github.com/python-trio/trio"
|
||||
},
|
||||
"release": "0.11",
|
||||
"mtime": 1556272773,
|
||||
"db_size": 736670
|
||||
}
|
||||
]
|
Loading…
Reference in a new issue