etherpad-lite/node/utils/ExportHtml.js

545 lines
13 KiB
JavaScript
Raw Normal View History

2011-07-06 14:57:07 +02:00
/**
* Copyright 2009 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS-IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
2011-08-10 18:31:20 +02:00
2011-07-06 14:57:07 +02:00
var async = require("async");
2011-07-27 19:52:23 +02:00
var Changeset = require("./Changeset");
var padManager = require("../db/PadManager");
var ERR = require("async-stacktrace");
2011-07-06 14:57:07 +02:00
2011-08-10 18:31:20 +02:00
/**
 * Renders a pad as plain text, one output line per pad line.
 *
 * List items are indented by (2 * listLevel - 1) spaces and prefixed with "* ";
 * all other lines are emitted unchanged. Every line is terminated by "\n".
 *
 * NOTE(review): this helper calls pad.getInternalRevisionAText(revNum)
 * synchronously and uses pad.atext() / pad.pool(), whereas getPadHTML and
 * getHTMLFromAtext in this file use a callback-style getInternalRevisionAText,
 * the pad.atext property and pad.apool(). Confirm which pad API is current
 * before relying on this function.
 *
 * @param pad     the pad to export
 * @param revNum  revision to export; when undefined the current text is used
 * @return the plain text representation
 */
function getPadPlainText(pad, revNum)
{
  var atext = ((revNum !== undefined) ? pad.getInternalRevisionAText(revNum) : pad.atext());
  // the text always carries one trailing newline that must not yield an extra line
  var textLines = atext.text.slice(0, -1).split('\n');
  var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);
  var apool = pad.pool();

  var pieces = [];
  for (var i = 0; i < textLines.length; i++)
  {
    var line = _analyzeLine(textLines[i], attribLines[i], apool);
    if (line.listLevel)
    {
      var numSpaces = line.listLevel * 2 - 1;
      var bullet = '*';
      // new Array(n + 1).join(' ') yields n spaces
      pieces.push(new Array(numSpaces + 1).join(' '), bullet, ' ', line.text, '\n');
    }
    else
    {
      pieces.push(line.text, '\n');
    }
  }

  return pieces.join('');
}
2011-08-10 18:31:20 +02:00
/**
 * Produces the HTML body fragment for a pad.
 *
 * When revNum is given (neither undefined nor null) the attributed text of that
 * revision is fetched through pad.getInternalRevisionAText; otherwise the pad's
 * current pad.atext is used. The attributed text is then converted with
 * getHTMLFromAtext.
 *
 * @param pad       the pad to export
 * @param revNum    optional revision number
 * @param callback  function (err, html); err is set when fetching the revision
 *                  or converting it to HTML fails
 */
function getPadHTML(pad, revNum, callback)
{
  var atext = pad.atext;
  var html;

  async.waterfall([
    // fetch revision atext
    function (next)
    {
      if (revNum != undefined)
      {
        pad.getInternalRevisionAText(revNum, function (err, revisionAtext)
        {
          if (ERR(err, next)) return;
          atext = revisionAtext;
          next();
        });
      }
      else
      {
        next(null);
      }
    },
    // convert atext to html
    function (next)
    {
      // A conversion failure is reported through the callback instead of
      // escaping as an uncaught exception from inside the waterfall.
      try
      {
        html = getHTMLFromAtext(pad, atext);
      }
      catch (conversionError)
      {
        next(conversionError);
        return;
      }
      next(null);
    }],
  // run final callback
  function (err)
  {
    if (ERR(err, callback)) return;
    callback(null, html);
  });
}
exports.getPadHTML = getPadHTML;
2011-08-10 18:31:20 +02:00
/**
 * Converts an attributed text into an HTML fragment.
 *
 * Each pad line becomes either a run of inline-formatted text followed by
 * <br>, or an <li> inside (possibly nested) <ul> elements when the line is a
 * list item. The attributes heading1, heading2, bold, italic, underline and
 * strikethrough are rendered as h1, h2, strong, em, u and s. URLs found by
 * _findURLs are wrapped in <a href="...">.
 *
 * @param pad    the pad; only pad.apool() is used here
 * @param atext  attributed text with `text` and `attribs` members
 * @return the HTML fragment (no <html>/<body> wrapper)
 */
function getHTMLFromAtext(pad, atext)
{
  var apool = pad.apool();
  // the text always carries one trailing newline that must not yield an extra line
  var textLines = atext.text.slice(0, -1).split('\n');
  var attribLines = Changeset.splitAttributionLines(atext.attribs, atext.text);

  // tags[i] is the element emitted for the attribute props[i]
  var tags = ['h1', 'h2', 'strong', 'em', 'u', 's'];
  var props = ['heading1', 'heading2', 'bold', 'italic', 'underline', 'strikethrough'];

  // maps attribute number in the pool -> index into tags/props
  var anumMap = {};
  props.forEach(function (propName, i)
  {
    var propTrueNum = apool.putAttrib([propName, true], true);
    if (propTrueNum >= 0)
    {
      anumMap[propTrueNum] = i;
    }
  });

  /**
   * Renders one line of text with its attribution string to HTML.
   */
  function getLineHTML(text, attribs)
  {
    // one state slot per entry in props; each slot is false/true between spans
    var propVals = props.map(function ()
    {
      return false;
    });
    var ENTER = 1;
    var STAY = 2;
    var LEAVE = 0;

    // Use order of tags as order of nesting, for simplicity
    // and decent nesting. For example,
    // <b>Just bold<b> <b><i>Bold and italics</i></b> <i>Just italics</i>
    // becomes
    // <b>Just bold <i>Bold and italics</i></b> <i>Just italics</i>
    var taker = Changeset.stringIterator(text);
    var assem = Changeset.stringAssembler();

    // stack of currently open tag indexes, innermost first
    var openTags = [];

    function emitOpenTag(i)
    {
      openTags.unshift(i);
      assem.append('<');
      assem.append(tags[i]);
      assem.append('>');
    }

    // closes the innermost open tag, which must be tag i
    function emitCloseTag(i)
    {
      openTags.shift();
      assem.append('</');
      assem.append(tags[i]);
      assem.append('>');
    }

    // Closes every tag listed in tags2close, innermost first. Tags that are
    // nested inside a tag being closed but are not themselves listed have to
    // be closed as well to keep the markup well formed; they are marked STAY
    // so that the caller's re-open pass opens them again.
    function orderdCloseTags(tags2close)
    {
      var deepest = -1;
      for (var i = 0; i < openTags.length; i++)
      {
        if (tags2close.indexOf(openTags[i]) >= 0)
        {
          deepest = i;
        }
      }

      for (var k = 0; k <= deepest; k++)
      {
        var tag = openTags[0];
        if (tags2close.indexOf(tag) < 0)
        {
          propVals[tag] = STAY; // closed only for nesting, re-open afterwards
        }
        emitCloseTag(tag);
      }
    }

    var urls = _findURLs(text);

    // number of characters of the line consumed so far
    var idx = 0;

    function processNextChars(numChars)
    {
      if (numChars <= 0)
      {
        return;
      }

      var iter = Changeset.opIterator(Changeset.subattribution(attribs, idx, idx + numChars));
      idx += numChars;

      while (iter.hasNext())
      {
        var o = iter.next();
        var propChanged = false;
        Changeset.eachAttribNumber(o.attribs, function (a)
        {
          if (a in anumMap)
          {
            var i = anumMap[a]; // index into tags/props
            if (!propVals[i])
            {
              propVals[i] = ENTER;
              propChanged = true;
            }
            else
            {
              propVals[i] = STAY;
            }
          }
        });
        for (var i = 0; i < propVals.length; i++)
        {
          if (propVals[i] === true)
          {
            // was on, not mentioned by this span
            propVals[i] = LEAVE;
            propChanged = true;
          }
          else if (propVals[i] === STAY)
          {
            propVals[i] = true; // set it back
          }
        }
        // now each member of propVal is in {false,LEAVE,ENTER,true}
        // according to what happens at start of span
        if (propChanged)
        {
          // leaving bold (e.g.) also leaves italics, etc.
          var left = false;
          for (var i = 0; i < propVals.length; i++)
          {
            var v = propVals[i];
            if (!left)
            {
              if (v === LEAVE)
              {
                left = true;
              }
            }
            else
            {
              if (v === true)
              {
                propVals[i] = STAY; // tag will be closed and re-opened
              }
            }
          }

          var tags2close = [];
          for (var i = propVals.length - 1; i >= 0; i--)
          {
            if (propVals[i] === LEAVE)
            {
              tags2close.push(i);
              propVals[i] = false;
            }
            else if (propVals[i] === STAY)
            {
              tags2close.push(i);
            }
          }
          orderdCloseTags(tags2close);

          for (var i = 0; i < propVals.length; i++)
          {
            if (propVals[i] === ENTER || propVals[i] === STAY)
            {
              emitOpenTag(i);
              propVals[i] = true;
            }
          }
          // propVals is now all {true,false} again
        } // end if (propChanged)

        var chars = o.chars;
        if (o.lines)
        {
          chars--; // exclude newline at end of line, if present
        }

        var s = taker.take(chars);

        // Remove every form feed (character code 12); they break the abiword
        // parser and carry no content.
        s = s.replace(/\f/g, "");

        assem.append(_escapeHTML(s));
      } // end iteration over spans in line

      // close everything still open so that each chunk is self-contained
      var tags2close = [];
      for (var i = propVals.length - 1; i >= 0; i--)
      {
        if (propVals[i])
        {
          tags2close.push(i);
          propVals[i] = false;
        }
      }
      orderdCloseTags(tags2close);
    } // end processNextChars

    if (urls)
    {
      urls.forEach(function (urlData)
      {
        var startIndex = urlData[0];
        var url = urlData[1];
        var urlLength = url.length;
        processNextChars(startIndex - idx);
        assem.append('<a href="' + url.replace(/\"/g, '&quot;') + '">');
        processNextChars(urlLength);
        assem.append('</a>');
      });
    }
    processNextChars(text.length - idx);

    return _processSpaces(assem.toString());
  } // end getLineHTML

  var pieces = [];

  // Need to deal with constraints imposed on HTML lists; can
  // only gain one level of nesting at once, can't change type
  // mid-list, etc.
  // People might use weird indenting, e.g. skip a level,
  // so we want to do something reasonable there. We also
  // want to deal gracefully with blank lines.
  var lists = []; // e.g. [[1,'bullet'], [3,'bullet'], ...]
  for (var i = 0; i < textLines.length; i++)
  {
    var line = _analyzeLine(textLines[i], attribLines[i], apool);
    var lineContent = getLineHTML(line.text, line.aline);

    if (line.listLevel || lists.length > 0)
    {
      // do list stuff
      var whichList = -1; // index into lists or -1
      if (line.listLevel)
      {
        whichList = lists.length;
        // pick the outermost open list whose level is >= this line's level
        for (var j = lists.length - 1; j >= 0; j--)
        {
          if (line.listLevel <= lists[j][0])
          {
            whichList = j;
          }
        }
      }

      if (whichList >= lists.length)
      {
        // deeper than every open list: open a new nested list
        lists.push([line.listLevel, line.listTypeName]);
        pieces.push('<ul><li>', lineContent || '<br>');
      }
      else if (whichList == -1)
      {
        if (line.text)
        {
          // non-blank line, end all lists
          // (the literal intentionally places a newline inside the closing tag)
          pieces.push(new Array(lists.length + 1).join('</li></ul\n>'));
          lists.length = 0;
          pieces.push(lineContent, '<br>');
        }
        else
        {
          // blank line inside a list: keep the list open
          pieces.push('<br><br>');
        }
      }
      else
      {
        // back to an outer list: close the deeper ones first
        while (whichList < lists.length - 1)
        {
          pieces.push('</li></ul>');
          lists.length--;
        }
        pieces.push('</li><li>', lineContent || '<br>');
      }
    }
    else
    {
      pieces.push(lineContent, '<br>');
    }
  }

  // close the lists that are still open at the end of the text
  pieces.push(new Array(lists.length + 1).join('</li></ul>'));

  return pieces.join('');
}
2011-08-10 18:31:20 +02:00
/**
 * Splits off the list marker of a line, if it has one.
 *
 * The first op of the line's attribution is inspected for a 'list' attribute.
 * If one is present the first character of the text (and of the attribution)
 * is dropped. A value of the form <letters><digit 1-8> additionally sets
 * listTypeName and listLevel.
 *
 * @param text   text of the line, without the trailing newline
 * @param aline  attribution string of the line; may be empty/undefined
 * @param apool  attribute pool used to resolve the 'list' attribute
 * @return object with `text`, `aline`, `listLevel` (0 when not a list item)
 *         and, for recognised list items, `listTypeName`
 */
function _analyzeLine(text, aline, apool)
{
  var line = {};

  // identify list
  var lineMarker = 0;
  line.listLevel = 0;
  if (aline)
  {
    var opIter = Changeset.opIterator(aline);
    if (opIter.hasNext())
    {
      var listType = Changeset.opAttributeValue(opIter.next(), 'list', apool);
      if (listType)
      {
        // any 'list' value marks the first character as a marker,
        // even when its format is not recognised below
        lineMarker = 1;
        listType = /([a-z]+)([12345678])/.exec(listType);
        if (listType)
        {
          line.listTypeName = listType[1];
          line.listLevel = Number(listType[2]);
        }
      }
    }
  }

  if (lineMarker)
  {
    line.text = text.substring(1);
    line.aline = Changeset.subattribution(aline, 1);
  }
  else
  {
    line.text = text;
    line.aline = aline;
  }

  return line;
}
2011-08-10 18:31:20 +02:00
/**
 * Exports a pad as a complete HTML document.
 *
 * The pad is loaded through padManager.getPad, rendered with getPadHTML and
 * wrapped in <html>/<body>. Unless noDocType is truthy, the document also gets
 * a doctype line and a <head> with charset and default font styles.
 *
 * @param padId      id of the pad to export
 * @param revNum     optional revision number, passed on to getPadHTML
 * @param noDocType  when truthy, omit the doctype and the <head> section
 * @param callback   function (err, htmlDocument)
 */
exports.getPadHTMLDocument = function (padId, revNum, noDocType, callback)
{
  padManager.getPad(padId, function (err, pad)
  {
    if (ERR(err, callback)) return;

    var head = (noDocType ? '' : '<!doctype html>\n') + '<html lang="en">\n' + (noDocType ? '' : '<head>\n' + '<meta charset="utf-8">\n' + '<style> * { font-family: arial, sans-serif;\n' + 'font-size: 13px;\n' + 'line-height: 17px; }</style>\n' + '</head>\n') + '<body>';

    var foot = '</body>\n</html>\n';

    getPadHTML(pad, revNum, function (err, html)
    {
      if (ERR(err, callback)) return;
      callback(null, head + html + foot);
    });
  });
};
2011-08-10 18:31:20 +02:00
/**
 * Escapes a string for use as HTML text content.
 *
 * "&", "<" and ">" become named entities. Every other character outside the
 * printable ASCII range 0x21-0x7E that is not whitespace becomes a numeric
 * character reference. A surrogate pair is emitted as ONE reference to its
 * code point, since references to individual surrogates are not valid HTML.
 *
 * @param s  the raw text
 * @return the escaped text
 */
function _escapeHTML(s)
{
  var entityFor = {
    '&': '&amp;',
    '<': '&lt;',
    '>': '&gt;'
  };

  s = s.replace(/[&<>]/g, function (c)
  {
    return entityFor[c];
  });

  // the first alternative must come first so that a valid pair is consumed
  // as a unit; an unpaired surrogate falls through to the second alternative
  return s.replace(/[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\x21-\x7E\s]/g, function (c)
  {
    var code = c.charCodeAt(0);
    if (c.length === 2)
    {
      // combine high and low surrogate into the code point
      code = (code - 0xD800) * 0x400 + (c.charCodeAt(1) - 0xDC00) + 0x10000;
    }
    return "&#" + code + ";";
  });
}
// copied from ACE
2011-08-10 18:31:20 +02:00
/**
 * Replaces spaces in an HTML line with &nbsp; where a plain space would
 * collapse or be dropped when rendered, while still allowing the line to wrap.
 *
 * Rules (tags are skipped when looking at neighbours):
 *  - in a run of spaces only the last one stays a plain space,
 *  - a space at the end of the line becomes &nbsp;,
 *  - a space at the beginning of the line becomes &nbsp;.
 *
 * @param s  one line of HTML (text already escaped)
 * @return the line with the affected spaces replaced
 */
function _processSpaces(s)
{
  // split into tags, single spaces and runs of other text
  var parts = [];
  s.replace(/<[^>]*>?| |[^ <]+/g, function (m)
  {
    parts.push(m);
  });

  var i;
  var p;

  // walk backwards: last space in a run is normal, others are nbsp,
  // end of line is nbsp
  var endOfLine = true;
  var beforeSpace = false;
  for (i = parts.length - 1; i >= 0; i--)
  {
    p = parts[i];
    if (p == " ")
    {
      if (endOfLine || beforeSpace) parts[i] = '&nbsp;';
      endOfLine = false;
      beforeSpace = true;
    }
    else if (p.charAt(0) != "<")
    {
      endOfLine = false;
      beforeSpace = false;
    }
  }

  // beginning of line is nbsp
  for (i = 0; i < parts.length; i++)
  {
    p = parts[i];
    if (p == " ")
    {
      parts[i] = '&nbsp;';
      break;
    }
    else if (p.charAt(0) != "<")
    {
      break;
    }
  }

  return parts.join('');
}
// copied from ACE
// Matches a single "word" character: ASCII digits and letters plus the listed
// non-ASCII ranges.
var _REGEX_WORDCHAR = /[\u0030-\u0039\u0041-\u005A\u0061-\u007A\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u00FF\u0100-\u1FFF\u3040-\u9FFF\uF900-\uFDFF\uFE70-\uFEFE\uFF10-\uFF19\uFF21-\uFF3A\uFF41-\uFF5A\uFF66-\uFFDC]/;
var _REGEX_SPACE = /\s/;
// Matches a single character that may appear inside a URL: the listed
// punctuation or any word character.
var _REGEX_URLCHAR = new RegExp('(' + /[-:@a-zA-Z0-9_.,~%+\/\\?=&#;()$]/.source + '|' + _REGEX_WORDCHAR.source + ')');
// Matches a URL: one of the listed schemes followed by "://" (or "mailto:"),
// then URL characters. The lookahead keeps a trailing ":", ".", "," or ";"
// out of the match. The regex is global, so lastIndex is stateful between
// exec() calls.
var _REGEX_URL = new RegExp(/(?:(?:https?|s?ftp|ftps|file|smb|afp|nfs|(x-)?man|gopher|txmt):\/\/|mailto:)/.source + _REGEX_URLCHAR.source + '*(?![:.,;])' + _REGEX_URLCHAR.source, 'g');
2011-07-06 14:57:07 +02:00
// returns null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...]
2011-08-10 18:31:20 +02:00
/**
 * Finds all URLs in a piece of text using _REGEX_URL.
 *
 * @param text  the text to scan
 * @return null if no URLs, or [[startIndex1, url1], [startIndex2, url2], ...]
 */
function _findURLs(text)
{
  // the regex is global and shared, so restart the scan from the beginning
  _REGEX_URL.lastIndex = 0;
  var urls = null;
  var execResult;
  while ((execResult = _REGEX_URL.exec(text)))
  {
    urls = (urls || []);
    var startIndex = execResult.index;
    var url = execResult[0];
    urls.push([startIndex, url]);
  }

  return urls;
}