Modify aegisub_convert_docs.pl to produce a static mirror suitable for serving on a web site

2012-08-31 12:51:00 -07:00 · 2012-08-31 12:51:00 -07:00 · 598d3d77e6
parent 1c9af767e5
commit 598d3d77e6
1 changed file with 107 additions and 96 deletions
--- a/docs/wiki_convert/aegisub_convert_docs.pl
+++ b/docs/wiki_convert/aegisub_convert_docs.pl
@@ -30,12 +30,13 @@ use File::Path;

 # variables
 my $base_dir = '.';
-my $host = 'http://aegisub.cellosoft.com';
-my $docs_base_url = 'http://aegisub.cellosoft.com/docs/';
+my $host = 'http://docs.aegisub.org';
+my $docs_base_url = 'http://docs.aegisub.org/manual/';
 my $agent = LWP::UserAgent->new();
 my @process_pages = ($docs_base_url . 'Main_Page'); # start at the main page
-my %pages = ($base_dir . '/Main_Page.html' => $docs_base_url . "Main_Page");
+my %pages = ($base_dir . '/Main_Page' => $docs_base_url . "Main_Page");
 my %accepted_types = (
+	'application/javascript' => '.js',
 	'image/png' 	=> '.png',
 	'image/jpeg' 	=> '.jpg',
 	'image/gif' 	=> '.gif',
@@ -79,16 +80,14 @@ while ( @process_pages ) {
 		next(GETLOOP);
 	}

-	
 	# monstrously ugly hack to handle rewriting of .css filenames
 	my $name;
-	if ( $current_page =~ m!index\.php\?title=MediaWiki\:.+?\.css!i ) {
+	if ( $current_page =~ m!css$!i ) {
 		$name = convert_css_link($current_page);
 	} else {
 		$name = convert_link($current_page);
 	}

-	
 	# if it's html, parse it, grab new links
 	# add them to @process_pages if they're not already in there
 	# then write the modified html to disk
@@ -159,14 +158,26 @@ sub parse_and_fix_html {
 	my ($url, $base, $content) = @_;
 	my (@links, @links_to_modify); # list of links to return later

+	# strip RSS etc.
+	$content =~ s!<link rel=[^>]*xml[^>]* />!!gi;
+
+	# kill the topbar
+	$content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;
+
+	# kill the article/discussion/history thing
+	$content =~ s!<div id=\"p-cactions\".*?</div>!!s;
+
+	# kill the "toolbox" at the bottom left
+	$content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;
+
+	# kill "recent changes"
+	$content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;

 	# parse HTML
 	my $html = HTML::LinkExtor->new();
-	# $html->utf8_mode(1); # not needed
 	$html->parse($content);
 	$html->eof();

-	
 	# loop over the list of links
 	# all real work is done here
 	LINKLIST:
@@ -181,7 +192,7 @@ sub parse_and_fix_html {
 			$href =~ s!\&!\&amp\;!g; # quotemeta kills &amp; things for some reason

 			# skip and kill special or "edit" links
-			if ( $attrs{'href'} =~ m!docs/index\.php\?!i ) {
+			if ( $attrs{'href'} =~ m!index\.php\?!i ) {
 				$content =~ s!<a href=\"${href}\".*?>(.*?)</a>!$1!gi;
 				next(LINKLIST);
 			}
@@ -236,10 +247,10 @@ sub parse_and_fix_html {
 			if ( $attrs{'src'} =~ m!^$quoted!i or (substr($attrs{'src'},0,1) eq '/') ) {
 				# "flatten" image links
 				my $flatlink = $attrs{'src'};
-				$flatlink =~ s!/docs/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
+				$flatlink =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
+				$flatlink =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!${base_dir}/images/$1!i;
 				my $quotedsrc = quotemeta($attrs{'src'});
 				$content =~ s!$quotedsrc!$flatlink!;
-				#push(@links_to_modify, $attrs{'src'});
 			}
 			else { next(LINKLIST); }

@@ -248,19 +259,6 @@ sub parse_and_fix_html {
 		# else do nothing
 	}

-	
-	# kill the topbar
-	$content =~ s!<div id=\"topbar\".*?<\!-- end topbar -->!!s;
-	
-	# kill the article/discussion/history thing
-	$content =~ s!<div id=\"p-cactions\".*?</div>!!s;
-	
-	# kill the "toolbox" at the bottom left
-	$content =~ s!<div class=\"portlet\" id=\"p-tb\".*?(<\!-- end of the left)!$1!s;
-	
-	# kill "recent changes"
-	$content =~ s!<li id=\"n-recentchanges\">.*?</li>!!;
-	
 	# handle the @import links to get the css right
 	while ( $content =~ m!\@import \"(.+?)\";!mg ) {
 		my $importlink = $1;
@@ -282,10 +280,10 @@ sub parse_and_fix_html {
 		$content =~ s!\"$link\"!\"$converted\"!g;
 	}

+	$url =~ s!manual_real!manual!;
+	$url =~ s!http://docs.aegisub.org!!;

-	my $filename = convert_link($url);
-	
-	return($filename, $content, @links);
+	return($url, $content, @links);
 }


@@ -293,24 +291,17 @@ sub write_to_disk {
 	my ($path, $type, $thing) = @_;
 	# return() if ( -e $path ); # this was a dumb idea

-	$path =~ m!(.*)/(.*?)\.\w{2,4}$!;
+	$path =~ m!/(.*)/(.*?)$!;
 	my ($tree, $filename) = ($1, $2);

-	# is it an image link?
-	if ( $tree =~ m!\./images/! ) {
-		# hax it
-		$path =~ s!/images/.+/!/images/!i;
-		$tree =~ s!/images.+!/images!i;
-	}
-	
 	# I don't think this is necessary really
 	mkpath($tree) unless ( -e $tree and -d $tree );

 	if ( $type =~ m!^text! ) {
-		write_text($path, $thing);
+		write_text('.' . $path, $thing);
 	}
 	else {
-		write_bin($path, $thing);
+		write_bin('.' . $path, $thing);
 	}

 	print("Writing $filename to ${path}...\n");
@@ -355,7 +346,22 @@ sub convert_link {
 	if ( substr($link,0,1) eq '@' ) {
 		substr($link,0,1) = '';
 		return(convert_css_link($link));
- 	}
+	}
+	elsif ($link =~ /\.css$/) {
+		return(convert_css_link($link));
+	}
+	elsif ($link =~ /\.js/) {
+		return(convert_js_link($link));
+	}
+
+	$link =~ s!http://docs.aegisub.org!!;
+	$link =~ s!/manual/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i;
+	$link =~ s!/manual_real/images/.+/(.+?\.(jpg|gif|png))!/manual/images/$1!i;
+
+	$link =~ s!manual_real/skins/.*?!manual/!;
+
+	#print("link: $link\n");
+	return($link);

 	# is it relative?
 	if ( substr($link,0,1) eq '/' ) {
@@ -376,7 +382,7 @@ sub convert_link {
 			$link =~ s!^${pagename}!${pagename}.html!;
 		}
 	} elsif ( $link !~ m!/.*?\.\w{2,4}$! or (substr($link,-4) eq '.lua') ) {
-			$link = $link . '.html'; 
+		$link = $link . '.html'; 
 	}

 	$link =~ s!\:!_!g; # replace : with _
@@ -385,23 +391,30 @@ sub convert_link {
 	return($link);
 }

-
 # HAX
 sub convert_css_link {
 	my $link = shift(@_);

 	# does it seem like css?
 	if ( $link =~ m!MediaWiki:(.+?)\.css!i ) {
-		return(convert_link('/docs/' . $1 . '.css'));
+		return("/manual/css/$1.css");
 	}
 	# has a sensible name already, don't fuck with it
-	elsif ( $link =~ m!/(.+?)\.css$!i ) {
-		return(convert_link($link)); 
+	elsif ( $link =~ m!/([^/]+?)\.css$!i ) {
+		return("/manual/css/$1.css");
 	}
 	# doesn't seem like anything useful
-	else { return(undef); }
+	else {
+		print("UNKNOWN CSS: $link\n");
+		return(undef);
+	}
 }

+sub convert_js_link {
+	my $link = shift(@_);
+	$link =~ m!/([^/]+\.js)!;
+	return("/manual/js/$1");
+}

 # argh
 sub parse_css {
@@ -424,7 +437,5 @@ sub parse_css {
 		push(@links, URI->new_abs($text, $base));
 	}

-	my $filename = convert_link($url);
-	
-	return($filename, @links);
+	return(@links);
 }