001package org.jsoup.safety; 002 003import org.jsoup.helper.Validate; 004import org.jsoup.nodes.Attribute; 005import org.jsoup.nodes.Attributes; 006import org.jsoup.nodes.DataNode; 007import org.jsoup.nodes.Document; 008import org.jsoup.nodes.Element; 009import org.jsoup.nodes.Node; 010import org.jsoup.nodes.TextNode; 011import org.jsoup.parser.ParseErrorList; 012import org.jsoup.parser.Parser; 013import org.jsoup.select.NodeTraversor; 014import org.jsoup.select.NodeVisitor; 015 016import java.util.List; 017 018import static org.jsoup.internal.SharedConstants.DummyUri; 019 020/** 021 The safelist based HTML cleaner. Use to ensure that end-user provided HTML contains only the elements and attributes 022 that you are expecting; no junk, and no cross-site scripting attacks! 023 <p> 024 The HTML cleaner parses the input as HTML and then runs it through a safe-list, so the output HTML can only contain 025 HTML that is allowed by the safelist. 026 </p> 027 <p> 028 It is assumed that the input HTML is a body fragment; the clean methods only pull from the source's body, and the 029 canned safe-lists only allow body contained tags. 030 </p> 031 <p> 032 Rather than interacting directly with a Cleaner object, generally see the {@code clean} methods in {@link org.jsoup.Jsoup}. 033 </p> 034 */ 035public class Cleaner { 036 private final Safelist safelist; 037 038 /** 039 Create a new cleaner, that sanitizes documents using the supplied safelist. 040 @param safelist safe-list to clean with 041 */ 042 public Cleaner(Safelist safelist) { 043 Validate.notNull(safelist); 044 this.safelist = safelist; 045 } 046 047 /** 048 Creates a new, clean document, from the original dirty document, containing only elements allowed by the safelist. 049 The original document is not modified. Only elements from the dirty document's <code>body</code> are used. The 050 OutputSettings of the original document are cloned into the clean document. 051 @param dirtyDocument Untrusted base document to clean. 052 @return cleaned document. 053 */ 054 public Document clean(Document dirtyDocument) { 055 Validate.notNull(dirtyDocument); 056 057 Document clean = Document.createShell(dirtyDocument.baseUri()); 058 copySafeNodes(dirtyDocument.body(), clean.body()); 059 clean.outputSettings(dirtyDocument.outputSettings().clone()); 060 061 return clean; 062 } 063 064 /** 065 Determines if the input document's <b>body</b> is valid, against the safelist. It is considered valid if all the 066 tags and attributes in the input HTML are allowed by the safelist, and that there is no content in the 067 <code>head</code>. 068 <p> 069 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 070 output of this method, the input document <b>must always</b> be normalized using a method such as 071 {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse 072 such as presentation to end users. This ensures that enforced attributes are set correctly, and that any 073 differences between how a given browser and how jsoup parses the input HTML are normalized. 074 </p> 075 <p>Example: 076 <pre>{@code 077 Document inputDoc = Jsoup.parse(inputHtml); 078 Cleaner cleaner = new Cleaner(Safelist.relaxed()); 079 boolean isValid = cleaner.isValid(inputDoc); 080 Document normalizedDoc = cleaner.clean(inputDoc); 081 }</pre> 082 </p> 083 @param dirtyDocument document to test 084 @return true if no tags or attributes need to be removed; false if they do 085 */ 086 public boolean isValid(Document dirtyDocument) { 087 Validate.notNull(dirtyDocument); 088 089 Document clean = Document.createShell(dirtyDocument.baseUri()); 090 int numDiscarded = copySafeNodes(dirtyDocument.body(), clean.body()); 091 return numDiscarded == 0 092 && dirtyDocument.head().childNodes().isEmpty(); // because we only look at the body, but we start from a shell, make sure there's nothing in the head 093 } 094 095 /** 096 Determines if the input document's <b>body HTML</b> is valid, against the safelist. It is considered valid if all 097 the tags and attributes in the input HTML are allowed by the safelist. 098 <p> 099 This method is intended to be used in a user interface as a validator for user input. Note that regardless of the 100 output of this method, the input document <b>must always</b> be normalized using a method such as 101 {@link #clean(Document)}, and the result of that method used to store or serialize the document before later reuse 102 such as presentation to end users. This ensures that enforced attributes are set correctly, and that any 103 differences between how a given browser and how jsoup parses the input HTML are normalized. 104 </p> 105 <p>Example: 106 <pre>{@code 107 Document inputDoc = Jsoup.parse(inputHtml); 108 Cleaner cleaner = new Cleaner(Safelist.relaxed()); 109 boolean isValid = cleaner.isValidBodyHtml(inputHtml); 110 Document normalizedDoc = cleaner.clean(inputDoc); 111 }</pre> 112 </p> 113 @param bodyHtml HTML fragment to test 114 @return true if no tags or attributes need to be removed; false if they do 115 */ 116 public boolean isValidBodyHtml(String bodyHtml) { 117 String baseUri = (safelist.preserveRelativeLinks()) ? DummyUri : ""; // fake base URI to allow relative URLs to remain valid 118 Document clean = Document.createShell(baseUri); 119 Document dirty = Document.createShell(baseUri); 120 ParseErrorList errorList = ParseErrorList.tracking(1); 121 List<Node> nodes = Parser.parseFragment(bodyHtml, dirty.body(), baseUri, errorList); 122 dirty.body().insertChildren(0, nodes); 123 int numDiscarded = copySafeNodes(dirty.body(), clean.body()); 124 return numDiscarded == 0 && errorList.isEmpty(); 125 } 126 127 /** 128 Iterates the input and copies trusted nodes (tags, attributes, text) into the destination. 129 */ 130 private final class CleaningVisitor implements NodeVisitor { 131 private int numDiscarded = 0; 132 private final Element root; 133 private Element destination; // current element to append nodes to 134 135 private CleaningVisitor(Element root, Element destination) { 136 this.root = root; 137 this.destination = destination; 138 } 139 140 @Override public void head(Node source, int depth) { 141 if (source instanceof Element) { 142 Element sourceEl = (Element) source; 143 144 if (safelist.isSafeTag(sourceEl.normalName())) { // safe, clone and copy safe attrs 145 ElementMeta meta = createSafeElement(sourceEl); 146 Element destChild = meta.el; 147 destination.appendChild(destChild); 148 149 numDiscarded += meta.numAttribsDiscarded; 150 destination = destChild; 151 } else if (source != root) { // not a safe tag, so don't add. don't count root against discarded. 152 numDiscarded++; 153 } 154 } else if (source instanceof TextNode) { 155 TextNode sourceText = (TextNode) source; 156 TextNode destText = new TextNode(sourceText.getWholeText()); 157 destination.appendChild(destText); 158 } else if (source instanceof DataNode && safelist.isSafeTag(source.parent().normalName())) { 159 DataNode sourceData = (DataNode) source; 160 DataNode destData = new DataNode(sourceData.getWholeData()); 161 destination.appendChild(destData); 162 } else { // else, we don't care about comments, xml proc instructions, etc 163 numDiscarded++; 164 } 165 } 166 167 @Override public void tail(Node source, int depth) { 168 if (source instanceof Element && safelist.isSafeTag(source.normalName())) { 169 destination = destination.parent(); // would have descended, so pop destination stack 170 } 171 } 172 } 173 174 private int copySafeNodes(Element source, Element dest) { 175 CleaningVisitor cleaningVisitor = new CleaningVisitor(source, dest); 176 NodeTraversor.traverse(cleaningVisitor, source); 177 return cleaningVisitor.numDiscarded; 178 } 179 180 private ElementMeta createSafeElement(Element sourceEl) { 181 Element dest = sourceEl.shallowClone(); // reuses tag, clones attributes and preserves any user data 182 String sourceTag = sourceEl.tagName(); 183 Attributes destAttrs = dest.attributes(); 184 dest.clearAttributes(); // clear all non-internal attributes, ready for safe copy 185 186 int numDiscarded = 0; 187 Attributes sourceAttrs = sourceEl.attributes(); 188 for (Attribute sourceAttr : sourceAttrs) { 189 if (safelist.isSafeAttribute(sourceTag, sourceEl, sourceAttr)) 190 destAttrs.put(sourceAttr); 191 else 192 numDiscarded++; 193 } 194 195 196 Attributes enforcedAttrs = safelist.getEnforcedAttributes(sourceTag); 197 // special case for <a href rel=nofollow>, only apply to external links: 198 if (sourceEl.nameIs("a") && enforcedAttrs.get("rel").equals("nofollow")) { 199 String href = sourceEl.absUrl("href"); 200 String sourceBase = sourceEl.baseUri(); 201 if (!href.isEmpty() && !sourceBase.isEmpty() && href.startsWith(sourceBase)) { // same site, so don't set the nofollow 202 enforcedAttrs.remove("rel"); 203 } 204 } 205 206 destAttrs.addAll(enforcedAttrs); 207 dest.attributes().addAll(destAttrs); // re-attach, if removed in clear 208 return new ElementMeta(dest, numDiscarded); 209 } 210 211 private static class ElementMeta { 212 Element el; 213 int numAttribsDiscarded; 214 215 ElementMeta(Element el, int numAttribsDiscarded) { 216 this.el = el; 217 this.numAttribsDiscarded = numAttribsDiscarded; 218 } 219 } 220 221}