Merge pull request #2668 from simong/tidy

Tidy HTML before trying to convert it with abiword
This commit is contained in:
John McLear 2015-05-18 20:04:15 +01:00
commit 5615bab0d9
5 changed files with 156 additions and 35 deletions

View File

@ -10,12 +10,12 @@
// favicon default name
// alternatively, set up a fully specified Url to your own favicon
"favicon": "favicon.ico",
//IP and port which etherpad should bind at
"ip": "0.0.0.0",
"port" : 9001,
/*
/*
// Node native SSL support
// this is disabled by default
//
@ -37,17 +37,17 @@
"dbSettings" : {
"filename" : "var/dirty.db"
},
/* An Example of MySQL Configuration
"dbType" : "mysql",
"dbSettings" : {
"user" : "root",
"host" : "localhost",
"password": "",
"user" : "root",
"host" : "localhost",
"password": "",
"database": "store"
},
*/
//the default text of a pad
"defaultPadText" : "Welcome to Etherpad!\n\nThis pad text is synchronized as you type, so that everyone viewing this page sees the same text. This allows you to collaborate seamlessly on documents!\n\nGet involved with Etherpad at http:\/\/etherpad.org\n",
@ -65,7 +65,7 @@
"chatAndUsers": false,
"lang": "en-gb"
},
/* Shoud we suppress errors from being visible in the default Pad Text? */
"suppressErrorsInPadText" : false,
@ -77,35 +77,39 @@
/* Users, who have a valid session, automatically get granted access to password protected pads */
"sessionNoPassword" : false,
/* if true, all css & js will be minified before sending to the client. This will improve the loading performance massivly,
/* if true, all css & js will be minified before sending to the client. This will improve the loading performance massivly,
but makes it impossible to debug the javascript/css */
"minify" : true,
/* How long may clients use served javascript code (in seconds)? Without versioning this
may cause problems during deployment. Set to 0 to disable caching */
"maxAge" : 21600, // 60 * 60 * 6 = 6 hours
/* This is the path to the Abiword executable. Setting it to null, disables abiword.
Abiword is needed to advanced import/export features of pads*/
Abiword is needed to advanced import/export features of pads*/
"abiword" : null,
/* This is the path to the Tidy executable. Setting it to null, disables Tidy.
Tidy is used to improve the quality of exported pads*/
"tidyHtml" : null,
/* Allow import of file types other than the supported types: txt, doc, docx, rtf, odt, html & htm */
"allowUnknownFileEnds" : true,
/* This setting is used if you require authentication of all users.
Note: /admin always requires authentication. */
"requireAuthentication" : false,
/* Require authorization by a module, or a user with is_admin set, see below. */
"requireAuthorization" : false,
/*when you use NginX or another proxy/ load-balancer set this to true*/
"trustProxy" : false,
/* Privacy: disable IP logging */
"disableIPlogging" : false,
"disableIPlogging" : false,
/* Users for basic authentication. is_admin = true gives access to /admin.
If you do not uncomment this, /admin will not be available! */
/*
@ -126,7 +130,7 @@
// Allow Load Testing tools to hit the Etherpad Instance. Warning this will disable security on the instance.
"loadTest": false,
/* The toolbar buttons configuration.
"toolbar": {
"left": [
@ -148,7 +152,7 @@
/* The log level we are using, can be: DEBUG, INFO, WARN, ERROR */
"loglevel": "INFO",
//Logging configuration. See log4js documentation for further information
// https://github.com/nomiddlename/log4js-node
// You can add as many appenders as you want here:

View File

@ -28,6 +28,7 @@ var fs = require("fs");
var settings = require('../utils/Settings');
var os = require('os');
var hooks = require("ep_etherpad-lite/static/js/pluginfw/hooks");
var TidyHtml = require('../utils/TidyHtml');
//load abiword only if its enabled
if(settings.abiword != null)
@ -35,28 +36,28 @@ if(settings.abiword != null)
var tempDirectory = "/tmp";
//tempDirectory changes if the operating system is windows
//tempDirectory changes if the operating system is windows
if(os.type().indexOf("Windows") > -1)
{
tempDirectory = process.env.TEMP;
}
/**
* do a requested export
*/
*/
exports.doExport = function(req, res, padId, type)
{
var fileName = padId;
// allow fileName to be overwritten by a hook, the type type is kept static for security reasons
hooks.aCallFirst("exportFileName", padId,
hooks.aCallFirst("exportFileName", padId,
function(err, hookFileName){
// if fileName is set then set it to the padId, note that fileName is returned as an array.
if(hookFileName.length) fileName = hookFileName;
//tell the browser that this is a downloadable file
res.attachment(fileName + "." + type);
//if this is a plain text export, we can do this directly
// We have to over engineer this because tabs are stored as attributes and not plain text
if(type == "etherpad"){
@ -72,7 +73,7 @@ exports.doExport = function(req, res, padId, type)
var txt;
var randNum;
var srcFile, destFile;
async.series([
//render the txt document
function(callback)
@ -96,7 +97,7 @@ exports.doExport = function(req, res, padId, type)
{
//ensure html can be collected by the garbage collector
txt = null;
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
abiword.convertFile(srcFile, destFile, type, callback);
},
@ -140,7 +141,7 @@ exports.doExport = function(req, res, padId, type)
var html;
var randNum;
var srcFile, destFile;
async.series([
//render the html document
function(callback)
@ -150,7 +151,7 @@ exports.doExport = function(req, res, padId, type)
if(ERR(err, callback)) return;
html = _html;
callback();
});
});
},
//decide what to do with the html export
function(callback)
@ -162,22 +163,29 @@ exports.doExport = function(req, res, padId, type)
hooks.aCallFirst("exportHTMLSend", html, function(err, newHTML){
if(newHTML.length) html = newHTML;
res.send(html);
callback("stop");
callback("stop");
});
}
else //write the html export to a file
{
randNum = Math.floor(Math.random()*0xFFFFFFFF);
srcFile = tempDirectory + "/etherpad_export_" + randNum + ".html";
fs.writeFile(srcFile, html, callback);
fs.writeFile(srcFile, html, callback);
}
},
//send the convert job to abiword
// Tidy up the exported HTML
function(callback)
{
//ensure html can be collected by the garbage collector
html = null;
TidyHtml.tidy(srcFile, callback);
},
//send the convert job to abiword
function(callback)
{
destFile = tempDirectory + "/etherpad_export_" + randNum + "." + type;
abiword.convertFile(srcFile, destFile, type, callback);
},
@ -199,7 +207,7 @@ exports.doExport = function(req, res, padId, type)
//100ms delay to accomidate for slow windows fs
if(os.type().indexOf("Windows") > -1)
{
setTimeout(function()
setTimeout(function()
{
fs.unlink(destFile, callback);
}, 100);

View File

@ -152,6 +152,11 @@ exports.minify = true;
*/
exports.abiword = null;
/**
* The path of the tidy executable
*/
exports.tidyHtml = null;
/**
* Should we support none natively supported file types on import?
*/
@ -167,7 +172,7 @@ exports.loglevel = "INFO";
*/
exports.disableIPlogging = false;
/**
/**
* Disable Load Testing
*/
exports.loadTest = false;
@ -239,7 +244,7 @@ exports.reloadSettings = function reloadSettings() {
} else {
settingsFilename = path.resolve(path.join(exports.root, settingsFilename));
}
var settingsStr;
try{
//read the settings sync

View File

@ -0,0 +1,41 @@
/**
* Tidy up the HTML in a given file
*/
var log4js = require('log4js');
var settings = require('./Settings');
var spawn = require('child_process').spawn;
exports.tidy = function(srcFile, callback) {
var logger = log4js.getLogger('TidyHtml');
// Don't do anything if Tidy hasn't been enabled
if (!settings.tidyHtml) {
logger.debug('tidyHtml has not been configured yet, ignoring tidy request');
return callback(null);
}
var errMessage = '';
// Spawn a new tidy instance that cleans up the file inline
logger.debug('Tidying ' + srcFile);
var tidy = spawn(settings.tidyHtml, ['-modify', srcFile]);
// Keep track of any error messages
tidy.stderr.on('data', function (data) {
errMessage += data.toString();
});
// Wait until Tidy is done
tidy.on('close', function(code) {
// Tidy returns a 0 when no errors occur and a 1 exit code when
// the file could be tidied but a few warnings were generated
if (code === 0 || code === 1) {
logger.debug('Tidied ' + srcFile + ' successfully');
return callback(null);
} else {
logger.error('Failed to tidy ' + srcFile + '\n' + errMessage);
return callback('Tidy died with exit code ' + code);
}
});
};

View File

@ -0,0 +1,63 @@
var assert = require('assert')
fs = require('fs'),
path = require('path'),
TidyHtml = null,
Settings = null;
var npm = require("../../../../src/node_modules/npm/lib/npm.js");
describe('tidyHtml', function() {
before(function(done) {
npm.load({}, function(err) {
assert.ok(!err);
TidyHtml = require('../../../../src/node/utils/TidyHtml');
Settings = require('../../../../src/node/utils/Settings');
return done()
});
});
it('Tidies HTML', function(done) {
// If the user hasn't configured Tidy, we skip this tests as it's required for this test
if (!Settings.tidyHtml) {
this.skip();
}
// Try to tidy up a bad HTML file
var tmpDir = process.env.TEMP || "/tmp";
var tmpFile = path.join(tmpDir, 'tmp_' + (Math.floor(Math.random() * 1000000)) + '.html')
fs.writeFileSync(tmpFile, '<html><body><p>a paragraph</p><li>List without outer UL</li>trailing closing p</p></body></html>');
TidyHtml.tidy(tmpFile, function(err){
assert.ok(!err);
// Read the file again
var cleanedHtml = fs.readFileSync(tmpFile).toString();
var expectedHtml = [
'<title></title>',
'</head>',
'<body>',
'<p>a paragraph</p>',
'<ul>',
'<li>List without outer UL</li>',
'<li style="list-style: none">trailing closing p</li>',
'</ul>',
'</body>',
'</html>',
].join('\n');
assert.notStrictEqual(cleanedHtml.indexOf(expectedHtml), -1);
return done();
});
});
it('can deal with errors', function(done) {
// If the user hasn't configured Tidy, we skip this tests as it's required for this test
if (!Settings.tidyHtml) {
this.skip();
}
TidyHtml.tidy('/some/none/existing/file.html', function(err) {
assert.ok(err);
return done();
});
});
});