Skip to content

Instantly share code, notes, and snippets.

@mrhampson
Created November 9, 2018 20:12
Show Gist options
  • Save mrhampson/c77e7e657e0d84d663c53157848c33c3 to your computer and use it in GitHub Desktop.
Save mrhampson/c77e7e657e0d84d663c53157848c33c3 to your computer and use it in GitHub Desktop.
Programming problem solution. Trying to parse a simple HTML file and then allow the user to query to check if certain nodes exist.
import java.io.File;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.Scanner;
import java.util.Stack;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
class Scratch {
private static final String TAG_REGEX = "<[^>]*>";
private static final Pattern TAG_PATTERN = Pattern.compile(TAG_REGEX);
private static final String QUERY_REGEX = "(\\w*)(\\[(\\w*)=(\\w*)\\])?";
private static final Pattern QUERY_PATTERN = Pattern.compile(QUERY_REGEX);
public static void main(String[] args) throws IOException, TagValidationException {
List<String> lines = Arrays.asList(
"<html><body>",
"<div><div><div id=123>",
"asdf",
"</div></div></div>",
"</body></html>"
);
DomNode root = parseFileLines(lines);
Scanner scanner = new Scanner(System.in);
System.out.println("Query: ");
while (true) {
String query = scanner.next();
if ("q".equals(query)) {
System.exit(0);
}
long start = System.nanoTime();
boolean found = query(query, root);
long end = System.nanoTime();
System.out.println(found ? "found" : "not found");
System.out.println("Query took: " + (end - start) + " ns");
}
}
/**
* Performs a query on the DOM tree
* It currently supports queries such as
* html.body.div.div[id=123]
*
* @param queryString the query string
* @param root the root of the dom tree
* @return true if the node was found, false otherwise
*/
private static boolean query(String queryString, DomNode root) {
String[] queryElements = queryString.split("\\.");
if (root.getTag().getTagType().equals(queryElements[0])) {
DomNode currentNode = root;
for (int i = 1; i < queryElements.length; i++) {
Matcher queryMatcher = QUERY_PATTERN.matcher(queryElements[i]);
String tagType;
Map<String, String> attributesToMatch = null;
if (queryMatcher.matches()) {
tagType = queryMatcher.group(1);
if (queryMatcher.group(3) != null && queryMatcher.group(4) != null) {
attributesToMatch = new HashMap<>();
attributesToMatch.put(queryMatcher.group(3), queryMatcher.group(4));
}
}
else {
return false;
}
DomNode matchedNode = currentNode.getChildren().stream()
.filter(child -> child.getTag().getTagType().equals(tagType))
.findAny()
.orElse(null);
if (matchedNode == null) {
return false;
}
if (attributesToMatch != null) {
for (Map.Entry<String, String> entry : attributesToMatch.entrySet()) {
String actualValue = matchedNode.getTag().getAttributeMap().get(entry.getKey());
if (actualValue == null || !actualValue.equals(entry.getValue())) {
return false;
}
}
}
currentNode = matchedNode;
}
return true;
}
return false;
}
/**
* Creates a DOM tree from the lines of an html file
* @param lines the file lines
* @return a DOM tree representation of the html file
* @throws TagValidationException if there was an invalid tag found
*/
private static DomNode parseFileLines(List<String> lines) throws TagValidationException {
DomNode firstNode = null;
Stack<DomNode> tagStack = new Stack<>();
long lineCounter = 0;
for (String line : lines) {
Matcher matcher = TAG_PATTERN.matcher(line);
while(matcher.find()) {
String tagString = matcher.group();
HtmlTag htmlTag = null;
try {
htmlTag = HtmlTag.fromTagString(tagString);
}
catch (TagValidationException e) {
System.out.println(e.getMessage());
System.out.println("Raw tag: " + tagString);
System.out.println("Line number: " + lineCounter);
throw e;
}
if (htmlTag.isClosingTag()) {
HtmlTag lastTag = tagStack.peek().getTag();
if (!HtmlTag.isOpenClosePair(lastTag, htmlTag)) {
throw new IllegalArgumentException("Found unmatched tag on line: " + lineCounter);
}
else {
tagStack.pop();
}
}
else {
DomNode newNode = new DomNode(htmlTag);
if (firstNode == null) {
firstNode = newNode;
}
else {
DomNode parent = tagStack.peek();
parent.getChildren().add(newNode);
}
tagStack.push(newNode);
}
}
lineCounter++;
}
return firstNode;
}
/**
* A class to represent an HtmlTag
* Currently stores the type of tag, it's attribute map and whether it was opening or closing
*/
private static final class HtmlTag {
private final String tagType;
private final boolean isOpeningTag;
private final Map<String, String> attributeMap;
private HtmlTag(String tagType, boolean isOpeningTag, Map<String, String> attributeMap) {
this.tagType = tagType;
this.isOpeningTag = isOpeningTag;
this.attributeMap = attributeMap;
}
public String getTagType() {
return tagType;
}
public boolean isOpeningTag() {
return isOpeningTag;
}
public boolean isClosingTag() {
return !isOpeningTag;
}
public Map<String, String> getAttributeMap() {
return attributeMap;
}
@Override
public boolean equals(Object o) {
if (this == o)
return true;
if (o == null || getClass() != o.getClass())
return false;
HtmlTag htmlTag = (HtmlTag) o;
return isOpeningTag == htmlTag.isOpeningTag && Objects.equals(tagType, htmlTag.tagType) && Objects
.equals(attributeMap, htmlTag.attributeMap);
}
@Override
public int hashCode() {
return Objects.hash(tagType, isOpeningTag, attributeMap);
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
sb.append('<');
if (!isOpeningTag) {
sb.append('/');
}
sb.append(tagType);
if (!attributeMap.isEmpty()) {
sb.append(' ');
for (Map.Entry<String, String> entry : attributeMap.entrySet()) {
sb.append(entry.getKey());
sb.append('=');
sb.append(entry.getValue());
sb.append(' ');
}
}
sb.append('>');
return sb.toString();
}
public static HtmlTag fromTagString(String tagString) throws TagValidationException {
String trimmedTagString = tagString.substring(1, tagString.length() - 1);
String[] words = trimmedTagString.split("\\s+");
if (words.length < 1) {
throw new TagValidationException("Bad tag string");
}
// Plain tag case
if (words.length == 1) {
String rawTag = words[0];
if (rawTag.length() < 2) {
throw new TagValidationException("Invalid tag");
}
boolean isOpeningTag = rawTag.charAt(1) != '\\';
String tagType = isOpeningTag ? rawTag : rawTag.substring(1);
return new HtmlTag(tagType, isOpeningTag, Collections.emptyMap());
}
else {
Map<String, String> attributesMap = new HashMap<>();
String tagType = words[0];
for (int i = 1; i < words.length; i++) {
String attribute = words[i];
String[] attrKeyValue = attribute.split("=");
if (attrKeyValue.length != 2) {
throw new TagValidationException("Invalid attribute");
}
attributesMap.put(attrKeyValue[0], attrKeyValue[1]);
}
return new HtmlTag(tagType, true, Collections.unmodifiableMap(attributesMap));
}
}
public static boolean isOpenClosePair(HtmlTag tag1, HtmlTag tag2) {
return tag1.isOpeningTag ^ tag2.isOpeningTag &&
tag1.tagType.equals(tag2.tagType);
}
}
/**
* A class to represent DomNodes
* It contains a tag as well as it's links to any child tags
*/
private static final class DomNode {
private final HtmlTag tag;
private final List<DomNode> children = new ArrayList<>();
public DomNode(HtmlTag tag) {
this.tag = tag;
}
public HtmlTag getTag() {
return tag;
}
public List<DomNode> getChildren() {
return children;
}
}
/**
* An Exception thrown if the tag could not be parsed from the string
*/
private static final class TagValidationException extends Exception {
public TagValidationException(String msg) {
super(msg);
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment