basic scraping working

This commit is contained in:
Mathieu PATUREL 2019-04-26 20:04:13 +10:00
parent 94803a4fa9
commit cda737ceec
5 changed files with 136 additions and 1 deletions

1
.gitignore vendored
View file

@ -6,3 +6,4 @@ public/fonts
public/docs/**/*
docs/**/*
!docs/*.md
vendor

View file

@ -0,0 +1,21 @@
module Docs
  class Trio
    # Narrows a Trio documentation page down to its main content container,
    # unwraps section wrappers, removes header links and converts every <dt>
    # into an <h3> heading.
    class CleanHtmlFilter < Filter
      # Rewrites the document in place.
      #
      # @return the cleaned root node (the former `div[role="main"]`)
      def call
        # Everything below operates on the main content container only.
        @doc = at_css('div[role="main"]')

        # Unwrap the wrappers: each matched node is replaced by its own children.
        css('.section, [itemprop=articleBody]').each do |node|
          node.replace node.children
        end

        css('.headerlink').remove

        css('dt').each do |node|
          heading = doc.document.create_element 'h3'
          # Carry the anchor over to the heading. The entries filter stores the
          # id of each `.descname` node's parent (presumably this <dt>) as the
          # entry fragment, so dropping it would leave those links dangling.
          heading['id'] = node['id'] if node['id']
          # Drops the final character of the term's text.
          # NOTE(review): `.headerlink` nodes were already removed above, so
          # confirm the last character is still something that should go.
          heading.content = node.inner_text[0...-1]
          node.replace heading
        end

        doc
      end
    end
  end
end

View file

@ -0,0 +1,21 @@
module Docs
  class Trio
    # Derives the index entries for a Trio documentation page: one entry named
    # after the page heading, plus one entry per `.descname` node on the page.
    class EntriesFilter < Docs::EntriesFilter
      # @return [String] the text of the first <h1>, minus its last character
      #   (presumably the permalink marker appended to the heading; confirm)
      def get_name
        at_css('h1').text[0...-1]
      end

      # @return [String] the same trimmed <h1> text used for the entry name
      def get_type
        at_css('h1').text[0...-1]
      end

      # Builds one [name, id] pair per `.descname` node. The name is the text
      # of the node's preceding sibling followed by the node's own text; the id
      # is the `id` attribute of the node's parent.
      #
      # @return [Array<Array>] list of [name, id] pairs
      def additional_entries
        css('.descname').each_with_object [] do |node, entries|
          # A `.descname` that is the first child of its parent has no previous
          # sibling; fall back to an empty prefix instead of calling #text on nil.
          qualifier = node.previous
          prefix = qualifier ? qualifier.text : ''
          name = prefix + node.text
          id = node.parent['id']
          entries << [name, id]
        end
      end
    end
  end
end

24
lib/docs/scrapers/trio.rb Normal file
View file

@ -0,0 +1,24 @@
module Docs
  # Scraper configuration for the Trio documentation at trio.readthedocs.io.
  class Trio < UrlScraper
    self.type = 'simple'
    self.release = '0.11'
    self.base_url = 'https://trio.readthedocs.io/en/latest/'
    self.root_path = 'index.html'
    self.links = {
      home: 'https://trio.readthedocs.io/',
      code: 'https://github.com/python-trio/trio'
    }

    html_filters.push 'trio/entries', 'trio/clean_html'

    # The attribution text is currently an empty string.
    options[:attribution] = <<-HTML
    HTML

    # One pattern per reference section of the documentation.
    options[:only_patterns] = %w[
      reference-core
      reference-io
      reference-testing
      reference-hazmat
    ].map { |section| /#{section}/ }
  end
end

View file

@ -1 +1,69 @@
[]
[
{
"name": "Chef",
"slug": "chef~12",
"type": "sphinx_simple",
"links": {
"home": "https://www.chef.io/",
"code": "https://github.com/chef/chef"
},
"version": "12",
"release": "12.13",
"mtime": 1556264506,
"db_size": 7170006
},
{
"name": "CSS",
"slug": "css",
"type": "mdn",
"mtime": 1543099045,
"db_size": 12415944
},
{
"name": "DOM",
"slug": "dom",
"type": "mdn",
"mtime": 1543157862,
"db_size": 33998524
},
{
"name": "DOM Events",
"slug": "dom_events",
"type": "mdn",
"mtime": 1543099589,
"db_size": 1752500
},
{
"name": "HTML",
"slug": "html",
"type": "mdn",
"mtime": 1543097764,
"db_size": 4141596
},
{
"name": "HTTP",
"slug": "http",
"type": "mdn",
"mtime": 1543099392,
"db_size": 4731727
},
{
"name": "JavaScript",
"slug": "javascript",
"type": "mdn",
"mtime": 1543098529,
"db_size": 6462141
},
{
"name": "Trio",
"slug": "trio",
"type": "simple",
"links": {
"home": "https://trio.readthedocs.io/",
"code": "https://github.com/python-trio/trio"
},
"release": "0.11",
"mtime": 1556272773,
"db_size": 736670
}
]