From 598d3d77e6779c8bfd581d0934d5dfdc4682cc48 Mon Sep 17 00:00:00 2001 From: Thomas Goyne Date: Fri, 31 Aug 2012 12:51:00 -0700 Subject: [PATCH] Modify aegisub_convert_docs.pl to produce a static mirror suitable for serving on a web site --- docs/wiki_convert/aegisub_convert_docs.pl | 203 ++++++++++++---------- 1 file changed, 107 insertions(+), 96 deletions(-) mode change 100644 => 100755 docs/wiki_convert/aegisub_convert_docs.pl diff --git a/docs/wiki_convert/aegisub_convert_docs.pl b/docs/wiki_convert/aegisub_convert_docs.pl old mode 100644 new mode 100755 index 67ff4e36b..76989c89d --- a/docs/wiki_convert/aegisub_convert_docs.pl +++ b/docs/wiki_convert/aegisub_convert_docs.pl @@ -30,12 +30,13 @@ use File::Path; # variables my $base_dir = '.'; -my $host = 'http://aegisub.cellosoft.com'; -my $docs_base_url = 'http://aegisub.cellosoft.com/docs/'; +my $host = 'http://docs.aegisub.org'; +my $docs_base_url = 'http://docs.aegisub.org/manual/'; my $agent = LWP::UserAgent->new(); my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page -my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page"); +my %pages = ($base_dir . '/Main_Page' => $docs_base_url . "Main_Page"); my %accepted_types = ( + 'application/javascript' => '.js', 'image/png' => '.png', 'image/jpeg' => '.jpg', 'image/gif' => '.gif', @@ -64,42 +65,40 @@ while ( @process_pages ) { # initialize object and download the page my $page_object = $agent->get($current_page); print("Requesting ${current_page}...\n"); - + # warn and skip if we couldn't get it unless ( $page_object->is_success() ) { warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n"); next(GETLOOP); } - + my $content_type = $page_object->content_type(); - + # skip if we don't know what it is unless ( exists($accepted_types{$content_type}) ) { warn("I don't know what to do with ", $content_type, ", skipping\n"); next(GETLOOP); } - - + # monstrously ugly hack to handle rewriting of .css filenames my $name; - if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) { + if ( $current_page =~ m!css$!i ) { $name = convert_css_link($current_page); } else { $name = convert_link($current_page); } - - + # if it's html, parse it, grab new links # add them to @process_pages if they're not already in there # then write the modified html to disk if ( $content_type eq 'text/html' ) { my ($filename, $content, @newpages) = parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content()); - + # skip this page if the parser decided it was a page we don't want next(GETLOOP) unless ($content); - + write_to_disk($filename, $content_type, $content); - + foreach my $url (@newpages) { my $newname = convert_link($url); # check if we already added that page to our todo-list @@ -116,9 +115,9 @@ while ( @process_pages ) { # if it's CSS we need the @import'ed links elsif ( $content_type eq 'text/css' ) { my @newpages = parse_css($current_page, $page_object->base(), $page_object->decoded_content()); - + write_to_disk($name, $content_type, $page_object->decoded_content()); - + foreach my $url (@newpages) { my $newname = convert_link($url); # check if we already added that page to our todo-list @@ -158,30 +157,42 @@ sub parse_and_fix_html { #get arguments my ($url, $base, $content) = @_; my (@links, @links_to_modify); # list of links to return later - - + + # strip RSS etc. + $content =~ s!]*xml[^>]* />!!gi; + + # kill the topbar + $content =~ s!
!!s; + + # kill the article/discussion/history thing + $content =~ s!
!!s; + + # kill the "toolbox" at the bottom left + $content =~ s!
.*?!!; + # parse HTML my $html = HTML::LinkExtor->new(); - # $html->utf8_mode(1); # not needed $html->parse($content); $html->eof(); - - + # loop over the list of links # all real work is done here LINKLIST: foreach my $tag ( $html->links() ) { my ($tagname, %attrs) = @{$tag}; - + my $quoted = quotemeta($docs_base_url); - + # does the link interest us? if ( ($tagname eq 'a') and exists($attrs{'href'}) ) { my $href = quotemeta($attrs{'href'}); # quotemeta? $href =~ s!\&!\&\;!g; # quotemeta kills & things for some reason - + # skip and kill special or "edit" links - if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) { + if ( $attrs{'href'} =~ m!index\.php\?!i ) { $content =~ s!(.*?)!$1!gi; next(LINKLIST); } @@ -198,7 +209,7 @@ sub parse_and_fix_html { push(@links_to_modify, $attrs{'href'}); next(LINKLIST); } - + # does it go within aegisub.cellosoft.com? if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) { push(@links_to_modify, $attrs{'href'}); @@ -206,7 +217,7 @@ sub parse_and_fix_html { # is not relative and goes somewhere else than aegisub.cellosoft.com # so we're not interested in it (#anchor links are not touched either) else { next(LINKLIST); } - + push(@links, URI->new_abs($attrs{'href'}, $base)); } elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) { @@ -214,53 +225,40 @@ sub parse_and_fix_html { push(@links_to_modify, $attrs{'href'}); } else { next(LINKLIST); } - + push(@links, URI->new_abs($attrs{'href'}, $base)); } elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) { my $src = quotemeta($attrs{'src'}); - + # bogus link, skip it if ( $attrs{'src'} =~ m!index\.php\?title=-!i ) { next(LINKLIST); } - + if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) { push(@links_to_modify, $attrs{'src'}); } else { next(LINKLIST); } - + push(@links, URI->new_abs($attrs{'src'}, $base)); } elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) { if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) { # "flatten" image links my $flatlink = $attrs{'src'}; - $flatlink =~ s!/docs/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i; + $flatlink =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i; + $flatlink =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i; my $quotedsrc = quotemeta($attrs{'src'}); $content =~ s!$quotedsrc!$flatlink!; - #push(@links_to_modify, $attrs{'src'}); } else { next(LINKLIST); } - + push(@links, URI->new_abs($attrs{'src'}, $base)); } # else do nothing } - - - # kill the topbar - $content =~ s!
!!s; - - # kill the article/discussion/history thing - $content =~ s!
!!s; - - # kill the "toolbox" at the bottom left - $content =~ s!
.*?!!; - + # handle the @import links to get the css right while ( $content =~ m!\@import \"(.+?)\";!mg ) { my $importlink = $1; @@ -270,8 +268,8 @@ sub parse_and_fix_html { push(@links_to_modify, '@' . $importlink); } } - - + + # rewrite all the links foreach my $link (@links_to_modify) { my $converted = convert_link($link); @@ -281,63 +279,56 @@ sub parse_and_fix_html { $link = quotemeta($link); $content =~ s!\"$link\"!\"$converted\"!g; } - - - my $filename = convert_link($url); - - return($filename, $content, @links); + + $url =~ s!manual_real!manual!; + $url =~ s!http://docs.aegisub.org!!; + + return($url, $content, @links); } sub write_to_disk { my ($path, $type, $thing) = @_; # return() if ( -e $path ); # this was a dumb idea - - $path =~ m!(.*)/(.*?)\.\w{2,4}$!; + + $path =~ m!/(.*)/(.*?)$!; my ($tree, $filename) = ($1, $2); - - # is it an image link? - if ( $tree =~ m!\./images/! ) { - # hax it - $path =~ s!/images/.+/!/images/!i; - $tree =~ s!/images.+!/images!i; - } - + # I don't think this is necessary really mkpath($tree) unless ( -e $tree and -d $tree ); - + if ( $type =~ m!^text! ) { - write_text($path, $thing); + write_text('.' . $path, $thing); } else { - write_bin($path, $thing); + write_bin('.' . $path, $thing); } - + print("Writing $filename to ${path}...\n"); - + $writes++; } sub write_text { my ($outfile, $thing) = @_; - + open(OUT, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!"); print OUT $thing; close(OUT) or die("Couldn't close ${outfile}: $!"); - + return(); } sub write_bin { my ($outfile, $thing) = @_; - + open(OUT, ">", $outfile) or die("Couldn't open $outfile for writing: $!"); binmode(OUT); print OUT $thing; close(OUT) or die("Couldn't close ${outfile}: $!"); - + return(); } @@ -345,18 +336,33 @@ sub write_bin { # converts links to relative starting with $base_dir sub convert_link { my $link = shift(@_); - + # dereference if necessary if ( ref($link) ) { $link = $$link; } - + # SPECIAL CASE: it's one of those fukken @import links, do something else with it if ( substr($link,0,1) eq '@' ) { substr($link,0,1) = ''; return(convert_css_link($link)); - } - + } + elsif ($link =~ /\.css$/) { + return(convert_css_link($link)); + } + elsif ($link =~ /\.js/) { + return(convert_js_link($link)); + } + + $link =~ s!http://docs.aegisub.org!!; + $link =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i; + $link =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i; + + $link =~ s!manual_real/skins/.*?!manual/!; + + #print("link: $link\n"); + return($link); + # is it relative? if ( substr($link,0,1) eq '/' ) { $link =~ s!^/docs/!$base_dir/!i; @@ -365,7 +371,7 @@ sub convert_link { my $quoted = quotemeta($host); $link =~ s!${quoted}/docs/!$base_dir/!i; } - + # if it doesn't have a filename extension it's probably a page, # and then we need to tack on .html to the end (fuck internet explorer) # oh and jfs's .lua pages aren't really lua scripts either @@ -376,55 +382,60 @@ sub convert_link { $link =~ s!^${pagename}!${pagename}.html!; } } elsif ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) { - $link = $link . '.html'; + $link = $link . '.html'; } - + $link =~ s!\:!_!g; # replace : with _ $link =~ s!\?!_!g; # replace ? with _ - + return($link); } - # HAX sub convert_css_link { my $link = shift(@_); - + # does it seem like css? if ( $link =~ m!MediaWiki:(.+?)\.css!i ) { - return(convert_link('/docs/' . $1 . '.css')); + return("/manual/css/$1.css"); } # has a sensible name already, don't fuck with it - elsif ( $link =~ m!/(.+?)\.css$!i ) { - return(convert_link($link)); + elsif ( $link =~ m!/([^/]+?)\.css$!i ) { + return("/manual/css/$1.css"); } # doesn't seem like anything useful - else { return(undef); } + else { + print("UNKNOWN CSS: $link\n"); + return(undef); + } } +sub convert_js_link { + my $link = shift(@_); + $link =~ m!/([^/]+\.js)!; + return("/manual/js/$1"); +} # argh sub parse_css { my ($url, $base, $content) = @_; my @links; # my $quoted = quotemeta($docs_base_url); # <--- not used - + # find url("stuff") blocks LINKLIST: while ( $content =~ m!url\((\")?(.+?)(\")?\)!mgi ) { my $text = $2; - + # skip it if it's nonrelative and goes somewhere else than aegisub.cellosoft.com if ( $text =~ m!^http!i ) { # actually fuck this there shouldn't be any nonrelative links in there anyway next(LINKLIST) #unless ( $text =~ m!^${quoted}!i ); } - + push(@links, URI->new_abs($text, $base)); } - - my $filename = convert_link($url); - - return($filename, @links); -} \ No newline at end of file + + return(@links); +}