commit e89a35446300a5b425abe4b5630908f0ca00f58b
Author: Michael Soukup
Date:   Wed Jan 23 13:37:25 2019 +0000

    Proof-of-concept setup

diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..46feab0
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,15 @@
+FROM selenium/standalone-chrome:3
+
+# Install packages
+RUN sudo apt-get update && sudo apt-get install -y python3-pip
+RUN sudo pip3 install selenium Flask
+
+# Add script
+ADD metacrawl.py /tmp/metacrawl.py
+ADD crawlserver.py /tmp/crawlserver.py
+
+# Set FLASK_APP env var
+ENV FLASK_APP=/tmp/crawlserver.py
+
+# Set entrypoint
+ENTRYPOINT ["flask", "run", "--host=0.0.0.0"]
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..12173a6
--- /dev/null
+++ b/README.md
@@ -0,0 +1,20 @@
+# Google search metacrawl
+
+Proof-of-concept.
+
+Server exposes a single endpoint `/metacrawl` which takes a search term `q`
+as argument and returns a JSON response.
+
+Build and run docker:
+
+```bash
+$ docker build -t metacrawl:0.1 .
+$ docker run --rm -it -p 5000:5000 metacrawl:0.1
+```
+
+Query server:
+
+```bash
+$ curl 'localhost:5000/metacrawl?q=yolo'
+{"metadescription":"An overused acronym for \"You only live once.\" There is an exception for those who believe in reincarnation or are cats.","metatitle":"Urban Dictionary: YOLO"}
+```
diff --git a/crawlserver.py b/crawlserver.py
new file mode 100644
index 0000000..ec038c5
--- /dev/null
+++ b/crawlserver.py
@@ -0,0 +1,21 @@
+"""Flask app exposing `google_search_metacrawl` as the `/metacrawl` endpoint."""
+from metacrawl import google_search_metacrawl
+from flask import Flask, request, jsonify
+
+app = Flask(__name__)
+
+
+@app.route("/metacrawl")
+def google_metacrawl():
+    """Run `google_search_metacrawl` on query argument `q`, reply with JSON.
+
+    Responds with HTTP 400 and a JSON `msg` when `q` is missing or blank;
+    otherwise returns the crawler's result dict serialized as JSON.
+    """
+    term = request.args.get('q')
+    # Treat a blank term like a missing one: there is nothing to search for.
+    if term is None or not term.strip():
+        # 400 lets clients tell a rejected request apart from a real result.
+        return jsonify({'msg': "No search term provided"}), 400
+    res = google_search_metacrawl(term)
+    return jsonify(res)
diff --git a/metacrawl.py b/metacrawl.py
new file mode 100644
index 0000000..d4fe0cb
--- /dev/null
+++ b/metacrawl.py
@@ -0,0 +1,36 @@
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+
+chrome_options = webdriver.ChromeOptions()
+chrome_options.add_argument('--no-sandbox')
+chrome_options.add_argument('--window-size=1420,1080')
+chrome_options.add_argument('--headless')
+chrome_options.add_argument('--disable-gpu')
+# `options=` supersedes the deprecated `chrome_options=` keyword argument.
+driver = webdriver.Chrome(options=chrome_options)
+
+
+def google_search_metacrawl(q):
+    """Search Google for `q` and scrape the first matching title and snippet.
+
+    Uses the module-level `driver`, so every call reuses the same browser
+    session.
+
+    Returns a dict with the keys 'metatitle' and 'metadescription' holding
+    the text of the first elements with class "LC20lb" and "st".
+
+    Raises selenium's NoSuchElementException when the search box or either
+    of those elements is not found on the page.
+    """
+    driver.get("https://www.google.no/")
+    search = driver.find_element(By.NAME, "q")
+    search.send_keys(q)
+    search.submit()
+    # NOTE(review): "LC20lb" and "st" look like Google's CSS classes for the
+    # result title and snippet; they are not a stable API -- confirm on breakage.
+    metatitle = driver.find_element(By.CLASS_NAME, "LC20lb")
+    metadescription = driver.find_element(By.CLASS_NAME, "st")
+    return {
+        'metatitle': metatitle.text,
+        'metadescription': metadescription.text
+    }