Finish LOVE scraper

This commit is contained in:
Thibaut Courouble 2016-06-19 16:44:09 -04:00
parent a25290de80
commit e3582e267e
12 changed files with 130 additions and 225 deletions

Binary file not shown.

Before

Width:  |  Height:  |  Size: 46 KiB

After

Width:  |  Height:  |  Size: 47 KiB

Binary file not shown.

Before

Width:  |  Height:  |  Size: 120 KiB

After

Width:  |  Height:  |  Size: 122 KiB

View file

@ -1,10 +1,13 @@
[
[
"2016-06-19",
"New documentation: <a href=\"/love/\">L&Ouml;VE</a>"
], [
"2016-06-12",
"New documentation: <a href=\"/angular/\">Angular 2</a>"
], [
"2016-06-05",
"New documentation: <a href=\"/kotlin/\">Kotlin</a> and <a href=\"/padrino/\">Padrino</a>"
"New documentations: <a href=\"/kotlin/\">Kotlin</a> and <a href=\"/padrino/\">Padrino</a>"
], [
"2016-04-24",
"New documentations: <a href=\"/numpy/\">NumPy</a> and <a href=\"/apache_pig/\">Apache Pig</a>"

View file

@ -275,6 +275,11 @@ credits = [
'1994–2015 Lua.org, PUC-Rio',
'MIT',
'http://www.lua.org/license.html'
], [
'L&Ouml;VE',
'2006-2016 L&Ouml;VE Development Team',
'GFDL',
'http://www.gnu.org/copyleft/fdl.html'
], [
'Marionette.js',
'2016 Muted Solutions, LLC',

View file

@ -142,3 +142,4 @@
._icon-kotlin:before { background-position: -6rem -11rem; }
._icon-padrino:before { background-position: -7rem -11rem; }
._icon-angular:before { background-position: -8rem -11rem; }
._icon-love:before { background-position: -9rem -11rem; }

View file

@ -1,67 +1,17 @@
._love {
padding-left: 1rem;
@extend %simple;
h1, h2 { margin-left: -1rem; }
h2 { @extend %block-heading; }
h3 { margin-left: -0.5rem; @extend %block-label; }
._mobile & {
padding-left: 0;
h1, h2, h3 { margin-left: 0; }
}
p > code, li > code { @extend %label; }
blockquote { @extend %note; }
.box { @extend %box; }
.note { @extend %note; }
.label { @extend %label; }
.note-green { @extend %note-green; }
.note-red { @extend %note-red; }
.box-heading {
@extend %heading-box;
padding: .5em .75em;
margin-top: 1.5rem;
margin-bottom: 0px;
border-bottom: none;
border-bottom-left-radius: 0px;
border-bottom-right-radius: 0px;
}
.box-with-heading {
@extend %box;
padding: .5em .75em;
margin-top: 0px;
margin-bottom: 1.5rem;
border-top-left-radius: 0px;
border-top-right-radius: 0px;
}
.label, dt > code { @extend %label; }
.label-green { @extend %label-green; }
.label-red { @extend %label-red; }
.smwtable { width: 100%; }
.smwtable td:nth-last-child(2), .smwtable td:last-child { width: 2.5em; }
.note-green, .label-green { @extend %note-green; }
.note-red, .label-red { @extend %note-red; }
.note-orange, .label-orange { @extend %note-orange; }
.cell-green { background: $noteGreenBackground; }
.cell-red { background: $noteRedBackground; }
.smwtable {
width: 100%;
tr {
td {
word-wrap: break-word;
}
td:first-child, td:nth-last-child(2), td:last-child {
vertical-align: middle;
white-space: nowrap;
width: 1em;
overflow: hidden;
}
}
}
hr {
border: none;
height: 1px;
background-color: $textColorLighter;
margin: 1.5em 0 1em;
}
}

View file

@ -39,6 +39,7 @@ module Docs
def to_internal_url(str)
return unless (url = parse_url(str)) && (subpath = subpath_to(url))
normalize_subpath(subpath)
subpath = URI.unescape(subpath) if context[:decode_and_clean_paths]
return if skip_subpath?(subpath)
normalize_url(url, subpath)
url

View file

@ -2,100 +2,67 @@ module Docs
class Love
class CleanHtmlFilter < Filter
def call
# Fix syntax highlighting
@doc = at_css('#mw-content-text')
css('.mw-code').each do |node|
node.content = node.at_css("div > pre").content
node.content = node.at_css('div > pre').content
node['data-language'] = 'lua'
node.name = 'pre'
end
# Move header tags up
css('h2', 'h3').each do |node|
headline = node.at_css('.mw-headline')
node['id'] = headline['id']
node.content = headline.inner_text
css('span[id]').each do |node|
node.parent['id'] = node['id']
node.before(node.children).remove
end
# Move dt tags up
css('dt > span').each do |node|
node.parent.content = node.inner_text
css('table.notice').each do |node|
content = node.at_css('td:nth-child(2)').inner_html
node.replace %(<p class="note">#{content}</p>)
end
# Style notices and new/removed sections
css('.notice', '.new-section', '.removed-section', '.removed-new-section').each do |node|
case node['class']
when 'notice'
node['class'] = 'note note-warning'
node.inner_html = node.at_css('td:nth-child(2)').inner_html
node.next.remove unless node.next.nil? or node.next.name != 'br'
when 'new-section', 'removed-section', 'removed-new-section'
node['class'] = node['class'] == 'new-section' ? 'note note-green' : 'note note-red'
node.inner_html = node.at_css('tr > td > i').inner_html \
+ '<br>' \
+ node.at_css('tr > td > small').inner_html
end
node.name = 'p'
node.remove_attribute('bgcolor')
node.remove_attribute('style')
node.remove_attribute('align')
css('table.new-section', 'table.removed-section', 'table.removed-new-section').each do |node|
klass = node['class'] == 'new-section' ? 'note-green' : 'note-red'
content = node.css('td').map(&:inner_html).join('<br>')
node.replace %(<p class="note #{klass}">#{content}</p>)
end
# Style new/removed features
css('.new-feature', '.removed-feature', '.removed-new-feature').each do |node|
node.name = 'div'
node['class'] = node['class'] == 'new-feature' ? 'box-heading label-green' : 'box-heading label-red'
node.remove_attribute('style')
klass = node['class'] == 'new-feature' ? 'label-green' : 'label-red'
content = node.content.sub(' LÖVE', '')
label = %( <span class="label #{klass}">#{content}</span>)
container = node.next_element
container.name = 'div'
container['class'] = 'box-with-heading'
container.remove_attribute('style')
end
# Style tables
css('table.smwtable').each do |table|
table.remove_attribute('style')
table.css('td').each do |cell|
cell.remove_attribute('style')
end
table.css('td:last-child', 'td:nth-last-child(2)').each do |cell|
img = cell.at_css('img')
if img then
if img['alt'] == 'Added since' then
cell['class'] = 'cell-green'
elsif img['alt'] == 'Removed in'
cell['class'] = 'cell-red'
end
img.remove
end
end
end
# Remove Other Languages
css('#Other_Languages').remove
css('.i18n').remove
# Remove changelog
node = at_css('h2#Changelog')
if !node.nil? then
begin
nxt = node.next
node.remove
node = nxt
end while !node.nil? and node.name != 'h2'
end
# Remove empty paragraphs
css('p').each do |node|
node.remove if node.inner_text.strip == ''
end
# Remove linebreaks that are the first or last child of a paragraph
css('p > br:first-child', 'p > br:last-child').each do |node|
node.next_element.css('dt').each { |n| n << label }
node.remove
end
css('img[src$="Add.png"]').each do |node|
node.parent['class'] = 'cell-green'
node.remove
end
css('img[src$="Remove.png"]').each do |node|
node.parent['class'] = 'cell-red'
node.remove
end
css('table, tr, td, th').each do |node|
%w(style cellpadding cellspacing width height valign).each do |attribute|
node.remove_attribute(attribute)
end
end
css('.note i', '.note small', 'div:not([class])', '.smwtable td:nth-last-child(2) > a', '.smwtable td:last-child > a').each do |node|
node.before(node.children).remove
end
css('p > br').each do |node|
node.parent.remove if node.parent.content.empty?
end
css('div > br', '> br', 'hr').remove
css('#Editing_the_wiki + p', '#Editing_the_wiki').remove
css('#Other_Languages', '.i18n').remove
doc
end
end

View file

@ -1,21 +1,55 @@
module Docs
class Love
class EntriesFilter < Docs::EntriesFilter
def get_type
if TYPE_OVERRIDE.key?(slug) then
return TYPE_OVERRIDE[slug]
elsif m = slug.match(/\A(love\.\w+)\z/) then
# modules and functions
return LOVE_MODULES.include?(m[1]) ? m[1] : 'love'
elsif m = slug.match(/\A(love\.\w+)\.(\w+)/) then
# functions in modules
return m[1]
elsif context[:list_classes] and (m = slug.match(/\A\(?([A-Z]\w+)\)?(\:\w+)?/)) then
# classes, their members and enums
return m[1] unless m[1].include?('_')
TYPES = {
'require' => 'Lua',
'light_userdata' => 'Lua',
'value' => 'Lua',
'variable' => 'Lua',
'Audio_Formats' => 'love.sound',
'ImageFontFormat' => 'love.font',
'BlendMode_Formulas' => 'love.graphics',
'Shader_Variables' => 'love.graphics',
'AreaSpreadDistribution' => 'love.graphics',
'BodyType' => 'love.physics',
'BufferMode' => 'love.filesystem',
'CompressedFormat' => 'love.image',
'JoystickConstant' => 'love.joystick',
'ParticleInsertMode' => 'love.graphics',
'String' => 'love',
'TextureMode' => 'love.graphics'
}
def call
if context[:initial_paths].include?(slug)
css('table.smwtable td:first-child > a').each do |node|
TYPES[node.content.strip] = slug
end
end
super
end
def get_type
if slug == 'love'
'love'
elsif slug.start_with?('enet')
'enet'
elsif slug.include?('Joint') || slug.include?('Shape')
'love.physics'
elsif TYPES.key?(slug)
TYPES[slug]
elsif match = slug.match(/\A(love\.\w+)(\.\w+)?\z/)
match[2] || context[:initial_paths].include?(match[1]) ? match[1] : 'love'
elsif at_css('#catlinks a[title="Category:Lua"]')
'Lua'
elsif
type = slug.split(':').first
type.remove! %r{[\(\)]}
TYPES[type]
end
# usually this shouldn't happen
"Other"
end
end
end

View file

@ -1,98 +1,42 @@
module Docs
class Love < UrlScraper
LOVE_MODULES = %w(
love
love.audio
love.event
love.filesystem
love.font
love.graphics
love.image
love.joystick
love.keyboard
love.math
love.mouse
love.physics
love.sound
love.system
love.thread
love.timer
love.touch
love.video
love.window
)
TYPE_OVERRIDE = {
"Audio_Formats" => "love.sound",
"ImageFontFormat" => "love.font",
"BlendMode_Formulas" => "BlendMode",
"Shader_Variables" => "Shader"
}
self.name = 'LÖVE'
self.slug = 'love'
self.type = 'love'
self.release = '0.10.1'
self.base_url = 'https://love2d.org/wiki/'
self.root_path = 'love'
self.initial_paths = LOVE_MODULES
self.root_path = 'Main_Page'
self.initial_paths = %w(love love.audio love.event love.filesystem love.font love.graphics
love.image love.joystick love.keyboard love.math love.mouse love.physics love.sound
love.system love.thread love.timer love.touch love.video love.window enet socket utf8)
self.links = {
home: 'https://love2d.org/',
code: 'https://bitbucket.org/rude/love'
}
html_filters.push 'love/clean_html', 'love/entries', 'title'
options[:root_title] = 'love'
options[:initial_paths] = LOVE_MODULES
html_filters.push 'love/entries', 'love/clean_html', 'title'
options[:root_title] = 'LÖVE'
options[:decode_and_clean_paths] = true
options[:container] = '#bodyContent'
# Add types to classes and their members
options[:list_classes] = true
options[:skip] = %w(Getting_Started Building_LÖVE Tutorial Tutorials Game_Distribution License
Games Libraries Software Snippets Version_History Lovers PO2_Syndrome HSL_color Guidelines)
options[:container] = '#mw-content-text'
options[:only_patterns] = [
/\A(love\z|love\.|[A-Z]|\([A-Z])/
# love
# love.* (modules and functions)
# Uppercased (classes and enums)
# (Uppercased) (generalized classes)
]
options[:skip] = %w(
Getting_Started
Building_LÖVE
Tutorial
Tutorials
Game_Distribution
License
Games
Libraries
Software
Snippets
Version_History
Lovers
PO2_Syndrome
HSL_color
)
options[:skip_patterns] = [
/_\([^\)]+\)\z/,
# anything_(language) (this might have to be tweaked)
/\ASpecial:/,
/\ACategory:/,
/\AFile:/,
/\AHelp:/,
/\ATemplate:/,
/\AUser:/,
/\ATutorial:/
# special pages are indistinguishable from instance methods
/_\([^\)]+\)\z/, # anything_(language)
/\A(Special|Category|File|Help|Template|User|Tutorial):/,
/\A\d/
]
options[:replace_paths] = {
"Config_Files" => "love.conf"
'Config_Files' => 'love.conf',
'conf.lua' => 'love.conf',
'lua-enet' => 'enet'
}
options[:attribution] = <<-HTML
&copy; L&Ouml;VE Development Team<br>
&copy; 2006&ndash;2016 L&Ouml;VE Development Team<br>
Licensed under the GNU Free Documentation License, Version 1.3.
HTML
end

Binary file not shown.

Before

Width:  |  Height:  |  Size: 1.4 KiB

After

Width:  |  Height:  |  Size: 919 B

Binary file not shown.

Before

Width:  |  Height:  |  Size: 2.6 KiB

After

Width:  |  Height:  |  Size: 1.9 KiB