package org.jsoup.safety;
import org.jsoup.helper.Validate;
import org.jsoup.nodes.*;
import org.jsoup.parser.Tag;
import java.util.List;
/**
The whitelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes
that you are expecting; no junk, and no cross-site scripting attacks!
<p/>
The HTML cleaner parses the input as HTML and then runs it through a white-list, so the output HTML can only contain
HTML that is allowed by the whitelist.
<p/>
It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the
canned white-lists only allow body contained tags.
<p/>
Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}.
*/
public class Cleaner {
private Whitelist whitelist;
/**
Create a new cleaner, that sanitizes documents using the supplied whitelist.
@param whitelist white-list to clean with
*/
public Cleaner(Whitelist whitelist) {
Validate.notNull(whitelist);
this.whitelist = whitelist;
}
/**
Creates a new, clean document, from the original dirty document, containing only elements allowed by the whitelist.
The original document is not modified. Only elements from the dirt document's <code>body</code> are used.
@param dirtyDocument Untrusted base document to clean.
@return cleaned document.
*/
public Document clean(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
Document clean = Document.createShell(dirtyDocument.baseUri());
copySafeNodes(dirtyDocument.body(), clean.body());
return clean;
}
/**
Dertmines if the input document is valid, against the whitelist. It is considered valid if all the tags and attributes
in the input HTML are allowed by the whitelist.
<p/>
This method can be used as a validator for user input forms. An invalid document will still be cleaned successfully
using the {@link #clean(Document)} document. If using as a validator, it is recommended to still clean the document
to ensure enforced attributes are set correctly, and that the output is tidied.
@param dirtyDocument document to test
@return true if no tags or attributes need to be removed; false if they do
*/
public boolean isValid(Document dirtyDocument) {
Validate.notNull(dirtyDocument);
Document clean = Document.createShell(dirtyDocument.baseUri());
int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body());
return numDiscarded == 0;
}
/**
Iterates the input and copies trusted nodes (tags, attributes, text) into the destination.
@param source source of HTML
@param dest destination element to copy into
@return number of discarded elements (that were considered unsafe)
*/
private int copySafeNodes(Element source, Element dest) {
List<Node> sourceChildren = source.childNodes();
int numDiscarded = 0;
for (Node sourceChild : sourceChildren) {
if (sourceChild instanceof Element) {
Element sourceEl = (Element) sourceChild;
if (whitelist.isSafeTag(sourceEl.tagName())) { // safe, clone and copy safe attrs
ElementMeta meta = createSafeElement(sourceEl);
Element destChild = meta.el;
dest.appendChild(destChild);
numDiscarded += meta.numAttribsDiscarded;
numDiscarded += copySafeNodes(sourceEl, destChild); // recurs
} else { // not a safe tag, but it may have children (els or text) that are, so recurse
numDiscarded++;
numDiscarded += copySafeNodes(sourceEl, dest);
}
} else if (sourceChild instanceof TextNode) {
TextNode sourceText = (TextNode) sourceChild;
TextNode destText = new TextNode(sourceText.getWholeText(), sourceChild.baseUri());
dest.appendChild(destText);
} // else, we don't care about comments, xml proc instructions, etc
}
return numDiscarded;
}
private ElementMeta createSafeElement(Element sourceEl) {
String sourceTag = sourceEl.tagName();
Attributes destAttrs = new Attributes();
Element dest = new Element(Tag.valueOf(sourceTag), sourceEl.baseUri(), destAttrs);
int numDiscarded = 0;
Attributes sourceAttrs = sourceEl.attributes();
for (Attribute sourceAttr : sourceAttrs) {
if (whitelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr))
destAttrs.put(sourceAttr);
else
numDiscarded++;
}
Attributes enforcedAttrs = whitelist.getEnforcedAttributes(sourceTag);
destAttrs.addAll(enforcedAttrs);
return new ElementMeta(dest, numDiscarded);
}
private static class ElementMeta {
Element el;
int numAttribsDiscarded;
ElementMeta(Element el, int numAttribsDiscarded) {
this.el = el;
this.numAttribsDiscarded = numAttribsDiscarded;
}
}
}
|