From 04cbb3a74fed583537085f9d87ab1f2231a89913 Mon Sep 17 00:00:00 2001
From: Karl Blomster
Date: Tue, 15 Jan 2008 01:18:49 +0000
Subject: [PATCH] Added a Perl script that converts the online wiki manual to
 static HTML. Still has a few annoying parsing bugs but it's mostly complete.

Originally committed to SVN as r1724.
---
 docs/aegisub_convert_docs.pl | 387 +++++++++++++++++++++++++++++++++++
 1 file changed, 387 insertions(+)
 create mode 100644 docs/aegisub_convert_docs.pl

diff --git a/docs/aegisub_convert_docs.pl b/docs/aegisub_convert_docs.pl
new file mode 100644
index 000000000..81ee766cd
--- /dev/null
+++ b/docs/aegisub_convert_docs.pl
@@ -0,0 +1,387 @@
+#!/usr/bin/perl
+
+########################
+#
+# aegisub_convert_docs.pl - downloads and converts the Aegisub documentation wiki to static HTML.
+#
+# Usage: aegisub_convert_docs.pl
+# Will write the entire wiki to the current directory.
+#
+# Warning: ugly hacking inside.
+#
+#########
+#
+# No copyright, no license.
+#
+########################
+
+
+# includes
+use warnings;
+use strict;
+use utf8;
+use LWP;
+use URI;
+# use CSS; # <--- fucking useless
+use HTML::LinkExtor;
+use File::Path;
+
+
+# variables
+my $base_dir = '.';
+my $host = 'http://aegisub.cellosoft.com';
+my $docs_base_url = 'http://aegisub.cellosoft.com/docs/';
+my $agent = LWP::UserAgent->new();
+my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page
+my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page");
+my %accepted_types = (
+    'image/png'       => '.png',
+    'image/jpeg'      => '.jpg',
+    'image/gif'       => '.gif',
+    'text/html'       => '.html',
+    'text/css'        => '.css',
+    'text/javascript' => '.js',
+    'text/plain'      => '.txt',
+    );
+my $timestamp = time();
+my ($requests, $writes);
+
+
+# make sure we have somewhere to write to
+mkdir($base_dir) or die("Couldn't create directory ", $base_dir, ": $!")
+    unless ( -e $base_dir and -d $base_dir );
+chdir($base_dir) or die("Couldn't change directory to ", $base_dir, ": $!");
+
+
+print("Starting downloading and conversion of documentation wiki to ", $base_dir, ".\n",
+    "This will probably take a while.\n");
+
+
+GETLOOP:
+while ( @process_pages ) {
+    my $current_page = shift(@process_pages);
+    # initialize object and download the page
+    my $page_object = $agent->get($current_page);
+    print("Requesting ${current_page}...\n");
+
+    # warn and skip if we couldn't get it
+    unless ( $page_object->is_success() ) {
+        warn("Couldn't get ", $current_page, ": ", $page_object->status_line(), ", skipping\n");
+        next(GETLOOP);
+    }
+
+    my $content_type = $page_object->content_type();
+
+    # skip if we don't know what it is
+    unless ( exists($accepted_types{$content_type}) ) {
+        warn("I don't know what to do with ", $content_type, ", skipping\n");
+        next(GETLOOP);
+    }
+
+
+    # monstrously ugly hack to handle rewriting of .css filenames
+    my $name;
+    if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) {
+        $name = convert_css_link($current_page);
+    } else {
+        $name = convert_link($current_page);
+    }
+
+
+    # if it's html, parse it, grab new links
+    # add them to @process_pages if they're not already in there
+    # then write the modified html to disk
+    if ( $content_type eq 'text/html' ) {
+        my ($filename, $content, @newpages) = parse_and_fix_html($current_page, $page_object->base(), $page_object->decoded_content());
+
+        # skip this page if the parser decided it was a page we don't want
+        next(GETLOOP) unless ($content);
+
+        write_to_disk($filename, $content_type, $content);
+
+        foreach my $url (@newpages) {
+            my $newname = convert_link($url);
+            # check if we already added that page to our todo-list
+            if ( exists($pages{$newname}) ) {
+                next(); # we did, do nothing
+            }
+            else {
+                # new page, add it to the list of things to process
+                push(@process_pages, $url);
+                $pages{$newname} = $url;
+            }
+        }
+    }
+    # if it's CSS we need the @import'ed links
+    elsif ( $content_type eq 'text/css' ) {
+        my @newpages = parse_css($current_page, $page_object->base(), $page_object->decoded_content());
+
+        write_to_disk($name, $content_type, $page_object->decoded_content());
+
+        foreach my $url (@newpages) {
+            my $newname = convert_link($url);
+            # check if we already added that page to our todo-list
+            if ( exists($pages{$newname}) ) {
+                next(); # we did, do nothing
+            }
+            else {
+                # new page, add it to the list of things to process
+                push(@process_pages, $url);
+                $pages{$newname} = $url;
+            }
+        }
+    }
+    else {
+        write_to_disk($name, $content_type, $page_object->decoded_content());
+    }
+}
+continue {
+    $requests++;
+}
+
+
+print("Done.\nMade $requests requests and wrote $writes files in ",
+    time()-$timestamp, " seconds.\n");
+
+exit(0);
+
+
+
+##########################################
+
+sub parse_and_fix_html {
+    # parse out pages this page links to
+    # modify it
+    # return filename of the page, modified html and list of links
+
+    # get arguments
+    my ($url, $base, $content) = @_;
+    my (@links, @links_to_modify); # list of links to return later
+
+
+    # parse HTML
+    my $html = HTML::LinkExtor->new();
+    # $html->utf8_mode(1); # not needed
+    $html->parse($content);
+    $html->eof();
+
+
+    # loop over the list of links
+    # all real work is done here
+    LINKLIST:
+    foreach my $tag ( $html->links() ) {
+        my ($tagname, %attrs) = @{$tag};
+
+        my $quoted = quotemeta($host);
+
+        # does the link interest us?
+        if ( ($tagname eq 'a') and exists($attrs{'href'}) ) {
+            my $href = quotemeta($attrs{'href'}); # quotemeta?
+            $href =~ s!\&!\&amp\;!g; # quotemeta kills &amp; things
+
+            # skip and kill special or "edit" links
+            if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) {
+                $content =~ s!<a (.*?)href="$href"(.*?)>(.*?)</a>!$3!gi;
+                next(LINKLIST);
+            }
+            # skip and kill image/special links
+            if ( $attrs{'href'} =~ m!(Special\:|Image\:|Talk\:)!i ) {
+                $content =~ s!<a href="$href".*?>(.*?)</a>!$1!gi;
+                next(LINKLIST);
+            }
+            # don't process #anchor links
+            if ( $attrs{'href'} =~ m!\#(.*?)$! ) {
+                next(LINKLIST);
+            }
+
+            # does it go within aegisub.cellosoft.com?
+            if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'href'});
+            }
+            # is not relative and goes somewhere else than aegisub.cellosoft.com
+            # so we're not interested in it
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'href'}, $base));
+        }
+        elsif ( ($tagname eq 'link') and exists($attrs{'href'}) ) {
+            if ( $attrs{'href'} =~ m!^$quoted!i or (substr($attrs{'href'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'href'});
+            }
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'href'}, $base));
+        }
+        elsif ( ($tagname eq 'script') and exists($attrs{'src'}) ) {
+            my $src = quotemeta($attrs{'src'});
+
+            # bogus link, skip it
+            if ( $attrs{'src'} =~ m!index\.php\?title=-!i ) {
+                next(LINKLIST);
+            }
+
+            if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'src'});
+            }
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'src'}, $base));
+        }
+        elsif ( ($tagname eq 'img') and exists($attrs{'src'}) ) {
+            if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
+                push(@links_to_modify, $attrs{'src'});
+            }
+            else { next(LINKLIST); }
+
+            push(@links, URI->new_abs($attrs{'src'}, $base));
+        }
+        # else do nothing
+    }
+
+
+    # kill the topbar
+    $content =~ s!\<div id="topbar"\>.*?\</div\>!!s;
+
+
+    # handle the @import links to get the css right
+    while ( $content =~ m!\@import \"(.+?)\";!mg ) {
+        my $importlink = $1;
+
+        if ( convert_css_link($importlink) ) {
+            push(@links, URI->new_abs($importlink, $base));
+            push(@links_to_modify, '@' . $importlink);
+        }
+    }
+
+
+    # rewrite all the links
+    foreach my $link (@links_to_modify) {
+        my $converted = convert_link($link);
+        if ( substr($link,0,1) eq '@' ) {
+            substr($link,0,1) = '';
+        }
+        $link = quotemeta($link);
+        $content =~ s!$link!$converted!g;
+    }
+
+
+    my $filename = convert_link($url);
+
+    return($filename, $content, @links);
+}
+
+
+sub write_to_disk {
+    my ($path, $type, $thing) = @_;
+    # return() if ( -e $path ); # this was a dumb idea
+
+    $path =~ m!(.*)/(.*?)\.\w{2,4}$!;
+    my ($tree, $filename) = ($1, $2);
+
+    mkpath($tree) unless ( -e $tree and -d $tree );
+
+    if ( $type =~ m!^text! ) {
+        write_text($path, $thing);
+    }
+    else {
+        write_bin($path, $thing);
+    }
+
+    print("Writing $filename to ${path}...\n");
+
+    $writes++;
+}
+
+
+sub write_text {
+    my ($outfile, $thing) = @_;
+
+    open(OUT, ">:utf8", $outfile) or die("Couldn't open $outfile for writing: $!");
+    print OUT $thing;
+    close(OUT) or die("Couldn't close ${outfile}: $!");
+
+    return();
+}
+
+
+sub write_bin {
+    my ($outfile, $thing) = @_;
+
+    open(OUT, ">", $outfile) or die("Couldn't open $outfile for writing: $!");
+    binmode(OUT);
+    print OUT $thing;
+    close(OUT) or die("Couldn't close ${outfile}: $!");
+
+    return();
+}
+
+
+# converts links to relative starting with $base_dir
+sub convert_link {
+    my $link = shift(@_);
+
+    # dereference if necessary
+    if ( ref($link) ) {
+        $link = $$link;
+    }
+
+    # SPECIAL CASE: it's one of those fukken @import links, do something else with it
+    if ( substr($link,0,1) eq '@' ) {
+        substr($link,0,1) = '';
+        return(convert_css_link($link));
+    }
+
+    # is it relative?
+    if ( substr($link,0,1) eq '/' ) {
+        $link =~ s!^/docs/!$base_dir/!i;
+    }
+    else {
+        my $quoted = quotemeta($host);
+        $link =~ s!${quoted}/docs/!$base_dir/!i;
+    }
+
+    # if it doesn't have a filename extension it's probably a page,
+    # and then we need to tack on .html to the end (fuck internet explorer)
+    # oh and jfs's .lua pages aren't lua scripts either
+    if ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) {
+        $link = $link . '.html';
+    }
+
+    $link =~ s!\:!_!g; # replace : with _
+    $link =~ s!\?!_!g; # replace ? with _
+
+    return($link);
+}
+
+
+# HAX
+sub convert_css_link {
+    my $link = shift(@_);
+
+    # does it seem like css?
+    if ( $link =~ m!MediaWiki:(.+?)\.css!i ) {
+        return(convert_link('/docs/' . $1 . '.css'));
+    }
+    # has a sensible name already, don't fuck with it
+    elsif ( $link =~ m!/(.+?)\.css$!i ) {
+        return(convert_link($link));
+    }
+    # doesn't seem like anything useful
+    else { return(undef); }
+}
+
+
+# argh
+sub parse_css {
+    my ($url, $base, $content) = @_;
+    my @links;
+
+    LINKLIST:
+    while ( $content =~ m!url\(\"(.+?)\"\)!mgi ) {
+        push(@links, URI->new_abs($1, $base));
+    }
+
+    my $filename = convert_link($url);
+
+    return($filename, @links);
+}
\ No newline at end of file
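
For anyone reviewing the page-name handling: the snippet below is a standalone sketch of the URL-to-filename mapping that convert_link() implements (strip the host and /docs/ prefix, append .html to extensionless pages and to the .lua doc pages, then replace ':' and '?' with '_'). The map_url() name and the sample URLs are illustrative only; they are not part of the committed script.

    #!/usr/bin/perl
    # Standalone sketch of the URL -> local-filename mapping from convert_link().
    # map_url() is a hypothetical name used only for this illustration.
    use strict;
    use warnings;

    my $host     = 'http://aegisub.cellosoft.com';
    my $base_dir = '.';

    sub map_url {
        my ($link) = @_;

        # strip the /docs/ prefix, whether the link is relative or absolute
        if ( substr($link, 0, 1) eq '/' ) {
            $link =~ s!^/docs/!$base_dir/!i;
        }
        else {
            my $quoted = quotemeta($host);
            $link =~ s!${quoted}/docs/!$base_dir/!i;
        }

        # extensionless pages (and the .lua doc pages) become .html files
        if ( $link !~ m!/.*?\.\w{2,4}$! or substr($link, -4) eq '.lua' ) {
            $link .= '.html';
        }

        # characters that are awkward in local filenames
        $link =~ s!\:!_!g;
        $link =~ s!\?!_!g;

        return $link;
    }

    print map_url('http://aegisub.cellosoft.com/docs/Main_Page'), "\n"; # ./Main_Page.html
    print map_url('/docs/Automation/Lua'), "\n";                        # ./Automation/Lua.html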
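
The crawl itself is a plain work-queue traversal: GETLOOP shifts URLs off @process_pages, and %pages, keyed on the converted local filename rather than on the raw URL, keeps differently spelled links to the same page from being fetched twice. Below is a minimal sketch of that bookkeeping, assuming a hypothetical enqueue() helper in place of the inline code in GETLOOP.

    #!/usr/bin/perl
    # Minimal sketch of the queue/dedup bookkeeping used in GETLOOP above.
    # enqueue() is a hypothetical helper; the real script does this inline.
    use strict;
    use warnings;

    my @process_pages;   # work queue of URLs still to fetch
    my %pages;           # converted filename => URL, marks pages already seen

    sub enqueue {
        my ($url, $local_name) = @_;
        return if exists $pages{$local_name};   # already queued or fetched, skip
        push @process_pages, $url;
        $pages{$local_name} = $url;
    }

    enqueue('http://aegisub.cellosoft.com/docs/Main_Page', './Main_Page.html');
    enqueue('http://aegisub.cellosoft.com/docs/Main_Page', './Main_Page.html'); # duplicate, ignored

    print scalar(@process_pages), " page(s) queued\n";   # prints "1 page(s) queued"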