Modify aegisub_convert_docs.pl to produce a static mirror suitable for serving on a web site

This commit is contained in:
Thomas Goyne 2012-08-31 12:51:00 -07:00
parent 1c9af767e5
commit 598d3d77e6
1 changed file with 107 additions and 96 deletions

203
docs/wiki_convert/aegisub_convert_docs.pl Normal file → Executable file
View File

@ -30,12 +30,13 @@ use File::Path;
# variables # variables
my $base_dir = '.'; my $base_dir = '.';
my $host = 'http://aegisub.cellosoft.com'; my $host = 'http://docs.aegisub.org';
my $docs_base_url = 'http://aegisub.cellosoft.com/docs/'; my $docs_base_url = 'http://docs.aegisub.org/manual/';
my $agent = LWP::UserAgent->new(); my $agent = LWP::UserAgent->new();
my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page
my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page"); my %pages = ($base_dir . '/Main_Page' => $docs_base_url . "Main_Page");
my %accepted_types = ( my %accepted_types = (
'application/javascript' => '.js',
'image/png' => '.png', 'image/png' => '.png',
'image/jpeg' => '.jpg', 'image/jpeg' => '.jpg',
'image/gif' => '.gif', 'image/gif' => '.gif',
@ -64,42 +65,40 @@ while ( @process_pages ) {
# initialize object and download the page # initialize object and download the page
my $page_object = $agent->get($current_page); my $page_object = $agent->get($current_page);
print("Requesting ${current_page}...\n"); print("Requesting ${current_page}...\n");
# warn and skip if we couldn't get it # warn and skip if we couldn't get it
unless ( $page_object->is_success() ) { unless ( $page_object->is_success() ) {
warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n"); warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n");
next(GETLOOP); next(GETLOOP);
} }
my $content_type = $page_object->content_type(); my $content_type = $page_object->content_type();
# skip if we don't know what it is # skip if we don't know what it is
unless ( exists($accepted_types{$content_type}) ) { unless ( exists($accepted_types{$content_type}) ) {
warn("I don't know what to do with ", $content_type, ", skipping\n"); warn("I don't know what to do with ", $content_type, ", skipping\n");
next(GETLOOP); next(GETLOOP);
} }
# monstrously ugly hack to handle rewriting of .css filenames # monstrously ugly hack to handle rewriting of .css filenames
my $name; my $name;
if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) { if ( $current_page =~ m!css$!i ) {
$name = convert_css_link($current_page); $name = convert_css_link($current_page);
} else { } else {
$name = convert_link($current_page); $name = convert_link($current_page);
} }
# if it's html, parse it, grab new links # if it's html, parse it, grab new links
# add them to @process_pages if they're not already in there # add them to @process_pages if they're not already in there
# then write the modified html to disk # then write the modified html to disk
if ( $content_type eq 'text/html' ) { if ( $content_type eq 'text/html' ) {
my ($filename, $content, @newpages) = parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content()); my ($filename, $content, @newpages) = parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content());
# skip this page if the parser decided it was a page we don't want # skip this page if the parser decided it was a page we don't want
next(GETLOOP) unless ($content); next(GETLOOP) unless ($content);
write_to_disk($filename, $content_type, $content); write_to_disk($filename, $content_type, $content);
foreach my $url (@newpages) { foreach my $url (@newpages) {
my $newname = convert_link($url); my $newname = convert_link($url);
# check if we already added that page to our todo-list # check if we already added that page to our todo-list
@ -116,9 +115,9 @@ while ( @process_pages ) {
# if it's CSS we need the @import'ed links # if it's CSS we need the @import'ed links
elsif ( $content_type eq 'text/css' ) { elsif ( $content_type eq 'text/css' ) {
my @newpages = parse_css($current_page, $page_object->base(), $page_object->decoded_content()); my @newpages = parse_css($current_page, $page_object->base(), $page_object->decoded_content());
write_to_disk($name, $content_type, $page_object->decoded_content()); write_to_disk($name, $content_type, $page_object->decoded_content());
foreach my $url (@newpages) { foreach my $url (@newpages) {
my $newname = convert_link($url); my $newname = convert_link($url);
# check if we already added that page to our todo-list # check if we already added that page to our todo-list
@ -158,30 +157,42 @@ sub parse_and_fix_html {
#get arguments #get arguments
my ($url, $base, $content) = @_; my ($url, $base, $content) = @_;
my (@links, @links_to_modify); # list of links to return later my (@links, @links_to_modify); # list of links to return later
# strip RSS etc.
$content =~ s!<link rel=[^>]*xml[^>]* />!!gi;
# kill the topbar
$content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;
# kill the article/discussion/history thing
$content =~ s!<div id=\"p-cactions\".*?</div>!!s;
# kill the "toolbox" at the bottom left
$content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;
# kill "recent changes"
$content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;
# parse HTML # parse HTML
my $html = HTML::LinkExtor->new(); my $html = HTML::LinkExtor->new();
# $html->utf8_mode(1); # not needed
$html->parse($content); $html->parse($content);
$html->eof(); $html->eof();
# loop over the list of links # loop over the list of links
# all real work is done here # all real work is done here
LINKLIST: LINKLIST:
foreach my $tag ( $html->links() ) { foreach my $tag ( $html->links() ) {
my ($tagname, %attrs) = @{$tag}; my ($tagname, %attrs) = @{$tag};
my $quoted = quotemeta($docs_base_url); my $quoted = quotemeta($docs_base_url);
# does the link interest us? # does the link interest us?
if ( ($tagname eq 'a') and exists($attrs{'href'}) ) { if ( ($tagname eq 'a') and exists($attrs{'href'}) ) {
my $href = quotemeta($attrs{'href'}); # quotemeta? my $href = quotemeta($attrs{'href'}); # quotemeta?
$href =~ s!\&!\&amp\;!g; # quotemeta kills &amp; things for some reason $href =~ s!\&!\&amp\;!g; # quotemeta kills &amp; things for some reason
# skip and kill special or "edit" links # skip and kill special or "edit" links
if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) { if ( $attrs{'href'} =~ m!index\.php\?!i ) {
$content =~ s!<a href=\"${href}\".*?>(.*?)</a>!$1!gi; $content =~ s!<a href=\"${href}\".*?>(.*?)</a>!$1!gi;
next(LINKLIST); next(LINKLIST);
} }
@ -198,7 +209,7 @@ sub parse_and_fix_html {
push(@links_to_modify, $attrs{'href'}); push(@links_to_modify, $attrs{'href'});
next(LINKLIST); next(LINKLIST);
} }
# does it go within aegisub.cellosoft.com? # does it go within aegisub.cellosoft.com?
if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) { if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
push(@links_to_modify, $attrs{'href'}); push(@links_to_modify, $attrs{'href'});
@ -206,7 +217,7 @@ sub parse_and_fix_html {
# is not relative and goes somewhere else than aegisub.cellosoft.com # is not relative and goes somewhere else than aegisub.cellosoft.com
# so we're not interested in it (#anchor links are not touched either) # so we're not interested in it (#anchor links are not touched either)
else { next(LINKLIST); } else { next(LINKLIST); }
push(@links, URI->new_abs($attrs{'href'}, $base)); push(@links, URI->new_abs($attrs{'href'}, $base));
} }
elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) { elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) {
@ -214,53 +225,40 @@ sub parse_and_fix_html {
push(@links_to_modify, $attrs{'href'}); push(@links_to_modify, $attrs{'href'});
} }
else { next(LINKLIST); } else { next(LINKLIST); }
push(@links, URI->new_abs($attrs{'href'}, $base)); push(@links, URI->new_abs($attrs{'href'}, $base));
} }
elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) { elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) {
my $src = quotemeta($attrs{'src'}); my $src = quotemeta($attrs{'src'});
# bogus link, skip it # bogus link, skip it
if ( $attrs{'src'} =~ m!index\.php\?title=-!i ) { if ( $attrs{'src'} =~ m!index\.php\?title=-!i ) {
next(LINKLIST); next(LINKLIST);
} }
if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) { if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
push(@links_to_modify, $attrs{'src'}); push(@links_to_modify, $attrs{'src'});
} }
else { next(LINKLIST); } else { next(LINKLIST); }
push(@links, URI->new_abs($attrs{'src'}, $base)); push(@links, URI->new_abs($attrs{'src'}, $base));
} }
elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) { elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) {
if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) { if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
# "flatten" image links # "flatten" image links
my $flatlink = $attrs{'src'}; my $flatlink = $attrs{'src'};
$flatlink =~ s!/docs/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i; $flatlink =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
$flatlink =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
my $quotedsrc = quotemeta($attrs{'src'}); my $quotedsrc = quotemeta($attrs{'src'});
$content =~ s!$quotedsrc!$flatlink!; $content =~ s!$quotedsrc!$flatlink!;
#push(@links_to_modify, $attrs{'src'});
} }
else { next(LINKLIST); } else { next(LINKLIST); }
push(@links, URI->new_abs($attrs{'src'}, $base)); push(@links, URI->new_abs($attrs{'src'}, $base));
} }
# else do nothing # else do nothing
} }
# kill the topbar
$content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;
# kill the article/discussion/history thing
$content =~ s!<div id=\"p-cactions\".*?</div>!!s;
# kill the "toolbox" at the bottom left
$content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;
# kill "recent changes"
$content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;
# handle the @import links to get the css right # handle the @import links to get the css right
while ( $content =~ m!\@import \"(.+?)\";!mg ) { while ( $content =~ m!\@import \"(.+?)\";!mg ) {
my $importlink = $1; my $importlink = $1;
@ -270,8 +268,8 @@ sub parse_and_fix_html {
push(@links_to_modify, '@' . $importlink); push(@links_to_modify, '@' . $importlink);
} }
} }
# rewrite all the links # rewrite all the links
foreach my $link (@links_to_modify) { foreach my $link (@links_to_modify) {
my $converted = convert_link($link); my $converted = convert_link($link);
@ -281,63 +279,56 @@ sub parse_and_fix_html {
$link = quotemeta($link); $link = quotemeta($link);
$content =~ s!\"$link\"!\"$converted\"!g; $content =~ s!\"$link\"!\"$converted\"!g;
} }
$url =~ s!manual_real!manual!;
my $filename = convert_link($url); $url =~ s!http://docs.aegisub.org!!;
return($filename, $content, @links); return($url, $content, @links);
} }
sub write_to_disk { sub write_to_disk {
my ($path, $type, $thing) = @_; my ($path, $type, $thing) = @_;
# return() if ( -e $path ); # this was a dumb idea # return() if ( -e $path ); # this was a dumb idea
$path =~ m!(.*)/(.*?)\.\w{2,4}$!; $path =~ m!/(.*)/(.*?)$!;
my ($tree, $filename) = ($1, $2); my ($tree, $filename) = ($1, $2);
# is it an image link?
if ( $tree =~ m!\./images/! ) {
# hax it
$path =~ s!/images/.+/!/images/!i;
$tree =~ s!/images.+!/images!i;
}
# I don't think this is necessary really # I don't think this is necessary really
mkpath($tree) unless ( -e $tree and -d $tree ); mkpath($tree) unless ( -e $tree and -d $tree );
if ( $type =~ m!^text! ) { if ( $type =~ m!^text! ) {
write_text($path, $thing); write_text('.' . $path, $thing);
} }
else { else {
write_bin($path, $thing); write_bin('.' . $path, $thing);
} }
print("Writing $filename to ${path}...\n"); print("Writing $filename to ${path}...\n");
$writes++; $writes++;
} }
sub write_text { sub write_text {
my ($outfile, $thing) = @_; my ($outfile, $thing) = @_;
open(OUT, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!"); open(OUT, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!");
print OUT $thing; print OUT $thing;
close(OUT) or die("Couldn't close ${outfile}: $!"); close(OUT) or die("Couldn't close ${outfile}: $!");
return(); return();
} }
sub write_bin { sub write_bin {
my ($outfile, $thing) = @_; my ($outfile, $thing) = @_;
open(OUT, ">", $outfile) or die("Couldn't open $outfile for writing: $!"); open(OUT, ">", $outfile) or die("Couldn't open $outfile for writing: $!");
binmode(OUT); binmode(OUT);
print OUT $thing; print OUT $thing;
close(OUT) or die("Couldn't close ${outfile}: $!"); close(OUT) or die("Couldn't close ${outfile}: $!");
return(); return();
} }
@ -345,18 +336,33 @@ sub write_bin {
# converts links to relative starting with $base_dir # converts links to relative starting with $base_dir
sub convert_link { sub convert_link {
my $link = shift(@_); my $link = shift(@_);
# dereference if necessary # dereference if necessary
if ( ref($link) ) { if ( ref($link) ) {
$link = $$link; $link = $$link;
} }
# SPECIAL CASE: it's one of those fukken @import links, do something else with it # SPECIAL CASE: it's one of those fukken @import links, do something else with it
if ( substr($link,0,1) eq '@' ) { if ( substr($link,0,1) eq '@' ) {
substr($link,0,1) = ''; substr($link,0,1) = '';
return(convert_css_link($link)); return(convert_css_link($link));
} }
elsif ($link =~ /\.css$/) {
return(convert_css_link($link));
}
elsif ($link =~ /\.js/) {
return(convert_js_link($link));
}
$link =~ s!http://docs.aegisub.org!!;
$link =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i;
$link =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i;
$link =~ s!manual_real/skins/.*?!manual/!;
#print("link: $link\n");
return($link);
# is it relative? # is it relative?
if ( substr($link,0,1) eq '/' ) { if ( substr($link,0,1) eq '/' ) {
$link =~ s!^/docs/!$base_dir/!i; $link =~ s!^/docs/!$base_dir/!i;
@ -365,7 +371,7 @@ sub convert_link {
my $quoted = quotemeta($host); my $quoted = quotemeta($host);
$link =~ s!${quoted}/docs/!$base_dir/!i; $link =~ s!${quoted}/docs/!$base_dir/!i;
} }
# if it doesn't have a filename extension it's probably a page, # if it doesn't have a filename extension it's probably a page,
# and then we need to tack on .html to the end (fuck internet explorer) # and then we need to tack on .html to the end (fuck internet explorer)
# oh and jfs's .lua pages aren't really lua scripts either # oh and jfs's .lua pages aren't really lua scripts either
@ -376,55 +382,60 @@ sub convert_link {
$link =~ s!^${pagename}!${pagename}.html!; $link =~ s!^${pagename}!${pagename}.html!;
} }
} elsif ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) { } elsif ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) {
$link = $link . '.html'; $link = $link . '.html';
} }
$link =~ s!\:!_!g; # replace : with _ $link =~ s!\:!_!g; # replace : with _
$link =~ s!\?!_!g; # replace ? with _ $link =~ s!\?!_!g; # replace ? with _
return($link); return($link);
} }
# HAX # HAX
sub convert_css_link { sub convert_css_link {
my $link = shift(@_); my $link = shift(@_);
# does it seem like css? # does it seem like css?
if ( $link =~ m!MediaWiki:(.+?)\.css!i ) { if ( $link =~ m!MediaWiki:(.+?)\.css!i ) {
return(convert_link('/docs/' . $1 . '.css')); return("/manual/css/$1.css");
} }
# has a sensible name already, don't fuck with it # has a sensible name already, don't fuck with it
elsif ( $link =~ m!/(.+?)\.css$!i ) { elsif ( $link =~ m!/([^/]+?)\.css$!i ) {
return(convert_link($link)); return("/manual/css/$1.css");
} }
# doesn't seem like anything useful # doesn't seem like anything useful
else { return(undef); } else {
print("UNKNOWN CSS: $link\n");
return(undef);
}
} }
sub convert_js_link {
my $link = shift(@_);
$link =~ m!/([^/]+\.js)!;
return("/manual/js/$1");
}
# argh # argh
sub parse_css { sub parse_css {
my ($url, $base, $content) = @_; my ($url, $base, $content) = @_;
my @links; my @links;
# my $quoted = quotemeta($docs_base_url); # <--- not used # my $quoted = quotemeta($docs_base_url); # <--- not used
# find url("stuff") blocks # find url("stuff") blocks
LINKLIST: LINKLIST:
while ( $content =~ m!url\((\")?(.+?)(\")?\)!mgi ) { while ( $content =~ m!url\((\")?(.+?)(\")?\)!mgi ) {
my $text = $2; my $text = $2;
# skip it if it's nonrelative and goes somewhere else than aegisub.cellosoft.com # skip it if it's nonrelative and goes somewhere else than aegisub.cellosoft.com
if ( $text =~ m!^http!i ) { if ( $text =~ m!^http!i ) {
# actually fuck this there shouldn't be any nonrelative links in there anyway # actually fuck this there shouldn't be any nonrelative links in there anyway
next(LINKLIST) next(LINKLIST)
#unless ( $text =~ m!^${quoted}!i ); #unless ( $text =~ m!^${quoted}!i );
} }
push(@links, URI->new_abs($text, $base)); push(@links, URI->new_abs($text, $base));
} }
my $filename = convert_link($url); return(@links);
}
return($filename, @links);
}