Proof-of-concept setup
This commit is contained in:
commit
e89a354463
15
Dockerfile
Normal file
15
Dockerfile
Normal file
@ -0,0 +1,15 @@
|
|||||||
|
FROM selenium/standalone-chrome:3

# The base image runs as the unprivileged user `seluser`; become root for the
# install steps instead of prefixing every command with sudo.
USER root

# Install packages
# update + install + list cleanup share one layer so the apt index can neither
# go stale in the build cache nor bloat the image. setuptools and wheel are
# listed explicitly because --no-install-recommends would otherwise drop them
# and pip could no longer install source distributions.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        python3-pip \
        python3-setuptools \
        python3-wheel \
    && rm -rf /var/lib/apt/lists/*

# metacrawl.py relies on the Selenium 3 API (find_element_by_*, chrome_options=),
# which later Selenium 4 releases removed, so stay below 4.
# TODO: pin Flask to the exact version this image was validated with.
RUN pip3 install --no-cache-dir Flask "selenium<4"

# Add script
# COPY rather than ADD: these are plain local files, no extraction or download.
COPY metacrawl.py crawlserver.py /tmp/

# Set FLASK_APP env var
ENV FLASK_APP=/tmp/crawlserver.py

# Drop back to the base image's unprivileged user for runtime.
USER seluser

# Flask's development server listens on port 5000 by default.
EXPOSE 5000

# Set entrypoint
ENTRYPOINT ["flask", "run", "--host=0.0.0.0"]
|
||||||
20
README.md
Normal file
20
README.md
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
# Google search metacrawl
|
||||||
|
|
||||||
|
Proof-of-concept.
|
||||||
|
|
||||||
|
Server exposes a single endpoint `/metacrawl` which takes a search term `q`
|
||||||
|
as argument and returns a JSON response.
|
||||||
|
|
||||||
|
Build and run docker:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ docker build -t metacrawl:0.1 .
|
||||||
|
$ docker run --rm -it -p 5000:5000 metacrawl:0.1
|
||||||
|
```
|
||||||
|
|
||||||
|
Query server:
|
||||||
|
|
||||||
|
```bash
|
||||||
|
$ curl 'localhost:5000/metacrawl?q=yolo'
|
||||||
|
{"metadescription":"An overused acronym for \"You only live once.\" There is an exception for those who believe in reincarnation or are cats.","metatitle":"Urban Dictionary: YOLO"}
|
||||||
|
```
|
||||||
12
crawlserver.py
Normal file
12
crawlserver.py
Normal file
@ -0,0 +1,12 @@
|
|||||||
|
from metacrawl import google_search_metacrawl
|
||||||
|
from flask import Flask, request, jsonify
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
|
||||||
|
@app.route("/metacrawl")
def google_metacrawl():
    """Handle GET /metacrawl: run the crawler for the ``q`` query parameter.

    Returns:
        The crawler's result serialized as JSON, or a JSON error message with
        HTTP status 400 when ``q`` is missing or contains only whitespace.
    """
    term = request.args.get('q')
    # A missing or blank term is a client error: answer 400 instead of 200 and
    # do not start a browser round-trip for an empty search.
    if term is None or not term.strip():
        return jsonify({'msg': "No search term provided"}), 400
    res = google_search_metacrawl(term)
    return jsonify(res)
|
||||||
20
metacrawl.py
Normal file
20
metacrawl.py
Normal file
@ -0,0 +1,20 @@
|
|||||||
|
from selenium import webdriver
|
||||||
|
|
||||||
|
# Command-line switches handed to Chrome, applied in this order.
_CHROME_FLAGS = (
    '--no-sandbox',
    '--window-size=1420,1080',
    '--headless',
    '--disable-gpu',
)

chrome_options = webdriver.ChromeOptions()
for _flag in _CHROME_FLAGS:
    chrome_options.add_argument(_flag)

# A single module-level driver is created at import time and used by
# google_search_metacrawl below.
driver = webdriver.Chrome(chrome_options=chrome_options)
|
||||||
|
|
||||||
|
def google_search_metacrawl(q):
    """Submit ``q`` on google.no and return the text of two result elements.

    Uses the module-level ``driver``. After submitting the search form it
    looks up the first element with class ``LC20lb`` and the first with class
    ``st`` (presumably the title and snippet of the top result; verify
    against Google's current markup).

    Returns:
        dict with keys ``'metatitle'`` and ``'metadescription'`` holding the
        text of those two elements.
    """
    driver.get("https://www.google.no/")

    # The search form's input field is named "q".
    query_box = driver.find_element_by_name("q")
    query_box.send_keys(q)
    query_box.submit()

    title_element = driver.find_element_by_class_name("LC20lb")
    snippet_element = driver.find_element_by_class_name("st")

    result = {}
    result['metatitle'] = title_element.text
    result['metadescription'] = snippet_element.text
    return result
|
||||||
Loading…
x
Reference in New Issue
Block a user