From 04cbb3a74fed583537085f9d87ab1f2231a89913 Mon Sep 17 00:00:00 2001
From: Karl Blomster
Date: Tue, 15 Jan 2008 01:18:49 +0000
Subject: [PATCH] Added a Perl script that converts the online wiki manual to
 static HTML. Still has a few annoying parsing bugs but it's mostly complete.

Originally committed to SVN as r1724.
---
 docs/aegisub_convert_docs.pl | 387 +++++++++++++++++++++++++++++++++++
 1 file changed, 387 insertions(+)
 create mode 100644 docs/aegisub_convert_docs.pl

diff --git a/docs/aegisub_convert_docs.pl b/docs/aegisub_convert_docs.pl
new file mode 100644
index 000000000..81ee766cd
--- /dev/null
+++ b/docs/aegisub_convert_docs.pl
@@ -0,0 +1,387 @@
+#!/usr/bin/perl
+
+########################
+#
+# aegisub_convert_docs.pl - downloads and converts the Aegisub documentation wiki to static HTML.
+#
+# Usage: aegisub_convert_docs.pl
+# Will write the entire wiki to the current directory.
+#
+# Warning: ugly hacking inside.
+#
+#########
+#
+# No copyright, no license.
+#
+########################
+
+
+# includes
+use warnings;
+use strict;
+use utf8;
+use LWP;
+use URI;
+# use CSS; # <--- fucking useless
+use HTML::LinkExtor;
+use File::Path;
+
+
+# variables
+my $base_dir = '.';
+my $host = 'http://aegisub.cellosoft.com';
+my $docs_base_url = 'http://aegisub.cellosoft.com/docs/';
+my $agent = LWP::UserAgent->new();
+my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page
+my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page");
+my %accepted_types = (
+    'image/png'       => '.png',
+    'image/jpeg'      => '.jpg',
+    'image/gif'       => '.gif',
+    'text/html'       => '.html',
+    'text/css'        => '.css',
+    'text/javascript' => '.js',
+    'text/plain'      => '.txt',
+    );
+my $timestamp = time();
+my ($requests, $writes);
+
+
+# make sure we have somewhere to write to
+mkdir($base_dir) or die("Couldn't create directory ", $base_dir, ": $!")
+    unless ( -e $base_dir and -d $base_dir );
+chdir($base_dir) or die("Couldn't change directory to ", $base_dir, ": $!");
+
+
+print("Starting downloading and conversion of documentation wiki to ", $base_dir, ".\n",
+    "This will probably take a while.\n");
+
+
+GETLOOP:
+while ( @process_pages ) {
+    my $current_page = shift(@process_pages);
+    # initialize object and download the page
+    my $page_object = $agent->get($current_page);
+    print("Requesting ${current_page}...\n");
+
+    # warn and skip if we couldn't get it
+    unless ( $page_object->is_success() ) {
+        warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n");
+        next(GETLOOP);
+    }
+
+    my $content_type = $page_object->content_type();
+
+    # skip if we don't know what it is
+    unless ( exists($accepted_types{$content_type}) ) {
+        warn("I don't know what to do with ", $content_type, ", skipping\n");
+        next(GETLOOP);
+    }
+
+
+    # monstrously ugly hack to handle rewriting of .css filenames
+    my $name;
+    if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) {
+        $name = convert_css_link($current_page);
+    } else {
+        $name = convert_link($current_page);
+    }
+
+
+    # if it's html, parse it, grab new links
+    # add them to @process_pages if they're not already in there
+    # then write the modified html to disk
+    if ( $content_type eq 'text/html' ) {
+        my ($filename, $content, @newpages) = parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content());
+
+        # skip this page if the parser decided it was a page we don't want
+        next(GETLOOP) unless ($content);
+
+        write_to_disk($filename, $content_type, $content);
+
+        foreach my $url (@newpages) {
+            my $newname = convert_link($url);
+            # check if we already added that page to our todo-list
+            if ( exists($pages{$newname}) ) {
+                next(); # we did, do nothing
+            }
+            else {
+                # new page, add it to the list of things to process
+                push(@process_pages, $url);
+                $pages{$newname} = $url;
+            }
+        }
+    }
+    # if it's CSS we need the @import'ed links
+    elsif ( $content_type eq 'text/css' ) {
+        my @newpages = parse_css($current_page, $page_object->base(), $page_object->decoded_content());
+
+        write_to_disk($name, $content_type, $page_object->decoded_content());
+
+        foreach my $url (@newpages) {
+            my $newname = convert_link($url);
+            # check if we already added that page to our todo-list
+            if ( exists($pages{$newname}) ) {
+                next(); # we did, do nothing
+            }
+            else {
+                # new page, add it to the list of things to process
+                push(@process_pages, $url);
+                $pages{$newname} = $url;
+            }
+        }
+    }
+    else {
+        write_to_disk($name, $content_type, $page_object->decoded_content());
+    }
+}
+continue {
+    $requests++;
+}
+
+
+print("Done.\nMade $requests requests and wrote $writes files in ",
+    time()-$timestamp, " seconds.\n");
+
+exit(0);
+
+
+
+##########################################
+
+sub parse_and_fix_html {
+    # parse out pages this page links to
+    # modify it
+    # return filename of the page, modified html and list of links
+
+    # get arguments
+    my ($url, $base, $content) = @_;
+    my (@links, @links_to_modify); # list of links to return later
+
+
+    # parse HTML
+    my $html = HTML::LinkExtor->new();
+    # $html->utf8_mode(1); # not needed
+    $html->parse($content);
+    $html->eof();
+
+
+    # loop over the list of links
+    # all real work is done here
+    LINKLIST:
+    foreach my $tag ( $html->links() ) {
+        my ($tagname, %attrs) = @{$tag};
+
+        my $quoted = quotemeta($host);
+
+        # does the link interest us?
+        if ( ($tagname eq 'a') and exists($attrs{'href'}) ) {
+            my $href = quotemeta($attrs{'href'}); # quotemeta?
+            $href =~ s!\&!\&amp\;!g; # quotemeta kills &amp; things
+
+            # skip and kill special or "edit" links
+            if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) {
+                $content =~ s!<a (.*?)href="$href"(.*?)>(.*?)</a>!$3!gi;
+                next(LINKLIST);
+            }
+            # skip and kill image/special links
+            if ( $attrs{'href'} =~ m!(Special\:|Image\:|Talk\:)!i ) {
+                $content =~ s!<a href="$href".*?>(.*?)</a>!$1!gi;
+                next(LINKLIST);
+            }
+            # don't process #anchor links
+            if ( $attrs{'href'} =~ m!\#(.*?)$! ) {
+                next(LINKLIST);
+            }
+
+            # does it go within aegisub.cellosoft.com?
+            if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'href'});
+            }
+            # is not relative and goes somewhere else than aegisub.cellosoft.com
+            # so we're not interested in it
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'href'}, $base));
+        }
+        elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) {
+            if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'href'});
+            }
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'href'}, $base));
+        }
+        elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) {
+            my $src = quotemeta($attrs{'src'});
+
+            # bogus link, skip it
+            if ( $attrs{'src'} =~ m!index\.php\?title=-!i ) {
+                next(LINKLIST);
+            }
+
+            if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'src'});
+            }
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'src'}, $base));
+        }
+        elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) {
+            if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'src'});
+            }
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'src'}, $base));
+        }
+        # else do nothing
+    }
+
+
+    # kill the topbar
+    $content =~ s!\<div id="topbar"\>.*?\</div\>!!s;
+
+
+    # handle the @import links to get the css right
+    while ( $content =~ m!\@import \"(.+?)\";!mg ) {
+        my $importlink = $1;
+
+        if ( convert_css_link($importlink) ) {
+            push(@links, URI->new_abs($importlink, $base));
+            push(@links_to_modify, '@' . $importlink);
+        }
+    }
+
+
+    # rewrite all the links
+    foreach my $link (@links_to_modify) {
+        my $converted = convert_link($link);
+        if ( substr($link,0,1) eq '@' ) {
+            substr($link,0,1) = '';
+        }
+        $link = quotemeta($link);
+        $content =~ s!$link!$converted!g;
+    }
+
+
+    my $filename = convert_link($url);
+
+    return($filename, $content, @links);
+}
+
+
+sub write_to_disk {
+    my ($path, $type, $thing) = @_;
+    # return() if ( -e $path ); # this was a dumb idea
+
+    $path =~ m!(.*)/(.*?)\.\w{2,4}$!;
+    my ($tree, $filename) = ($1, $2);
+
+    mkpath($tree) unless ( -e $tree and -d $tree );
+
+    if ( $type =~ m!^text! ) {
+        write_text($path, $thing);
+    }
+    else {
+        write_bin($path, $thing);
+    }
+
+    print("Writing $filename to ${path}...\n");
+
+    $writes++;
+}
+
+
+sub write_text {
+    my ($outfile, $thing) = @_;
+
+    open(OUT, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!");
+    print OUT $thing;
+    close(OUT) or die("Couldn't close ${outfile}: $!");
+
+    return();
+}
+
+
+sub write_bin {
+    my ($outfile, $thing) = @_;
+
+    open(OUT, ">", $outfile) or die("Couldn't open $outfile for writing: $!");
+    binmode(OUT);
+    print OUT $thing;
+    close(OUT) or die("Couldn't close ${outfile}: $!");
+
+    return();
+}
+
+
+# converts links to relative starting with $base_dir
+sub convert_link {
+    my $link = shift(@_);
+
+    # dereference if necessary
+    if ( ref($link) ) {
+        $link = $$link;
+    }
+
+    # SPECIAL CASE: it's one of those fukken @import links, do something else with it
+    if ( substr($link,0,1) eq '@' ) {
+        substr($link,0,1) = '';
+        return(convert_css_link($link));
+    }
+
+    # is it relative?
+    if ( substr($link,0,1) eq '/' ) {
+        $link =~ s!^/docs/!$base_dir/!i;
+    }
+    else {
+        my $quoted = quotemeta($host);
+        $link =~ s!${quoted}/docs/!$base_dir/!i;
+    }
+
+    # if it doesn't have a filename extension it's probably a page,
+    # and then we need to tack on .html to the end (fuck internet explorer)
+    # oh and jfs's .lua pages aren't lua scripts either
+    if ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) {
+        $link = $link . '.html';
+    }
+
+    $link =~ s!\:!_!g; # replace : with _
+    $link =~ s!\?!_!g; # replace ? with _
+
+    return($link);
+}
+
+
+# HAX
+sub convert_css_link {
+    my $link = shift(@_);
+
+    # does it seem like css?
+    if ( $link =~ m!MediaWiki:(.+?)\.css!i ) {
+        return(convert_link('/docs/' . $1 . '.css'));
+    }
+    # has a sensible name already, don't fuck with it
+    elsif ( $link =~ m!/(.+?)\.css$!i ) {
+        return(convert_link($link));
+    }
+    # doesn't seem like anything useful
+    else { return(undef); }
+}
+
+
+# argh
+sub parse_css {
+    my ($url, $base, $content) = @_;
+    my @links;
+
+    LINKLIST:
+    while ( $content =~ m!url\(\"(.+?)\"\)!mgi ) {
+        push(@links, URI->new_abs($1, $base));
+    }
+
+    my $filename = convert_link($url);
+
+    return($filename, @links);
+}
\ No newline at end of file
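
For anyone reviewing the page-name handling: the snippet below is a standalone sketch of the URL-to-filename mapping that convert_link() implements (strip the host and /docs/ prefix, append .html to extensionless pages and to the .lua doc pages, then replace ':' and '?' with '_'). The map_url() name and the sample URLs are illustrative only; they are not part of the committed script.

    #!/usr/bin/perl
    # Standalone sketch of the URL -> local-filename mapping from convert_link().
    # map_url() is a hypothetical name used only for this illustration.
    use strict;
    use warnings;

    my $host     = 'http://aegisub.cellosoft.com';
    my $base_dir = '.';

    sub map_url {
        my ($link) = @_;

        # strip the /docs/ prefix, whether the link is relative or absolute
        if ( substr($link, 0, 1) eq '/' ) {
            $link =~ s!^/docs/!$base_dir/!i;
        }
        else {
            my $quoted = quotemeta($host);
            $link =~ s!${quoted}/docs/!$base_dir/!i;
        }

        # extensionless pages (and the .lua doc pages) become .html files
        if ( $link !~ m!/.*?\.\w{2,4}$! or substr($link, -4) eq '.lua' ) {
            $link .= '.html';
        }

        # characters that are awkward in local filenames
        $link =~ s!\:!_!g;
        $link =~ s!\?!_!g;

        return $link;
    }

    print map_url('http://aegisub.cellosoft.com/docs/Main_Page'), "\n"; # ./Main_Page.html
    print map_url('/docs/Automation/Lua'), "\n";                        # ./Automation/Lua.html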
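
The crawl itself is a plain work-queue traversal: GETLOOP shifts URLs off @process_pages, and %pages, keyed on the converted local filename rather than on the raw URL, keeps differently spelled links to the same page from being fetched twice. Below is a minimal sketch of that bookkeeping, assuming a hypothetical enqueue() helper in place of the inline code in GETLOOP.

    #!/usr/bin/perl
    # Minimal sketch of the queue/dedup bookkeeping used in GETLOOP above.
    # enqueue() is a hypothetical helper; the real script does this inline.
    use strict;
    use warnings;

    my @process_pages;   # work queue of URLs still to fetch
    my %pages;           # converted filename => URL, marks pages already seen

    sub enqueue {
        my ($url, $local_name) = @_;
        return if exists $pages{$local_name};   # already queued or fetched, skip
        push @process_pages, $url;
        $pages{$local_name} = $url;
    }

    enqueue('http://aegisub.cellosoft.com/docs/Main_Page', './Main_Page.html');
    enqueue('http://aegisub.cellosoft.com/docs/Main_Page', './Main_Page.html'); # duplicate, ignored

    print scalar(@process_pages), " page(s) queued\n";   # prints "1 page(s) queued"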