Proof-of-concept setup
This commit is contained in:
commit
e89a354463
15
Dockerfile
Normal file
15
Dockerfile
Normal file
@ -0,0 +1,15 @@
|
||||
# Base image provides Chrome and chromedriver; Python, Selenium bindings and
# Flask are layered on top to serve the crawler over HTTP.
FROM selenium/standalone-chrome:3

# Package installation needs root. Switching user explicitly replaces the
# per-command `sudo` calls.
USER root

# Install packages
# update + install + list cleanup share one layer so the apt index is never
# stale and never shipped in the image. setuptools/wheel are listed explicitly
# because --no-install-recommends would otherwise drop them.
RUN apt-get update \
    && apt-get install -y --no-install-recommends \
        python3-pip \
        python3-setuptools \
        python3-wheel \
    && rm -rf /var/lib/apt/lists/*

# selenium is held below 4.x because metacrawl.py is written against the
# Selenium 3 API (find_element_by_* helpers, chrome_options= keyword).
# TODO: pin Flask to an exact version once the target Python version is confirmed.
RUN pip3 install --no-cache-dir "selenium>=3.141,<4" Flask

# Add script
# COPY is sufficient for plain local files. /app is used instead of /tmp so a
# tmpfs mount or tmp cleanup cannot hide the application.
COPY metacrawl.py crawlserver.py /app/

# Set FLASK_APP env var
ENV FLASK_APP=/app/crawlserver.py

# Drop back to the unprivileged user for runtime.
# NOTE(review): `seluser` is assumed to be the base image's default account
# (the original `sudo` calls imply a non-root default) - confirm against the
# selenium/standalone-chrome image.
USER seluser

# Flask's development server listens on 5000 by default (documentation only).
EXPOSE 5000

# Set entrypoint
ENTRYPOINT ["flask", "run", "--host=0.0.0.0"]
|
||||
20
README.md
Normal file
20
README.md
Normal file
@ -0,0 +1,20 @@
|
||||
# Google search metacrawl
|
||||
|
||||
Proof-of-concept.
|
||||
|
||||
Server exposes a single endpoint `/metacrawl` which takes a search term `q`
|
||||
as a query-string parameter and returns a JSON response.
|
||||
|
||||
Build and run docker:
|
||||
|
||||
```bash
|
||||
$ docker build -t metacrawl:0.1 .
|
||||
$ docker run --rm -it -p 5000:5000 metacrawl:0.1
|
||||
```
|
||||
|
||||
Query server:
|
||||
|
||||
```bash
|
||||
$ curl 'localhost:5000/metacrawl?q=yolo'
|
||||
{"metadescription":"An overused acronym for \"You only live once.\" There is an exception for those who believe in reincarnation or are cats.","metatitle":"Urban Dictionary: YOLO"}
|
||||
```
|
||||
12
crawlserver.py
Normal file
12
crawlserver.py
Normal file
@ -0,0 +1,12 @@
|
||||
from metacrawl import google_search_metacrawl
|
||||
from flask import Flask, request, jsonify
|
||||
|
||||
# Flask application object; the /metacrawl route below is registered on it.
app = Flask(__name__)
|
||||
|
||||
@app.route("/metacrawl")
def google_metacrawl():
    """Handle ``GET /metacrawl?q=<term>``.

    Reads the search term from the ``q`` query parameter, passes it to
    ``google_search_metacrawl`` and returns that function's result as JSON.

    Returns:
        A JSON response with the crawl result, or a JSON ``msg`` object with
        HTTP status 400 when ``q`` is missing or blank.
    """
    term = request.args.get('q')
    # A blank term is as unusable as a missing one; reject both with a client
    # error status instead of reporting success.
    if term is None or not term.strip():
        return jsonify({'msg': "No search term provided"}), 400
    res = google_search_metacrawl(term)
    return jsonify(res)
|
||||
20
metacrawl.py
Normal file
20
metacrawl.py
Normal file
@ -0,0 +1,20 @@
|
||||
from selenium import webdriver
|
||||
|
||||
# Chrome launch flags: sandbox disabled, fixed window size, headless mode,
# GPU disabled.
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument('--no-sandbox')
chrome_options.add_argument('--window-size=1420,1080')
chrome_options.add_argument('--headless')
chrome_options.add_argument('--disable-gpu')

# One browser session is created when this module is imported and reused by
# google_search_metacrawl(). `options=` replaces the deprecated
# `chrome_options=` keyword.
driver = webdriver.Chrome(options=chrome_options)
|
||||
|
||||
def google_search_metacrawl(q):
    """Search google.no for ``q`` and return text from the results page.

    Uses the module-level ``driver`` session: loads the Google front page,
    types ``q`` into the search box, submits the form, then reads the text of
    the first element with class ``LC20lb`` and the first with class ``st``.

    Args:
        q: Search term typed into the search box.

    Returns:
        dict with keys ``'metatitle'`` and ``'metadescription'`` holding the
        text of those two elements.

    Raises:
        selenium.common.exceptions.NoSuchElementException: the search box
            was not found on the front page.
        selenium.common.exceptions.TimeoutException: a result element did not
            appear within 10 seconds.
    """
    # Local imports keep this block self-contained; selenium is already a
    # dependency of this module.
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # NOTE(review): every call shares the single module-level `driver`;
    # overlapping requests would interleave on one browser session - confirm
    # how the server dispatches requests.
    driver.get("https://www.google.no/")
    search = driver.find_element(By.NAME, "q")
    search.send_keys(q)
    search.submit()

    # Wait explicitly for the result elements instead of assuming the results
    # page has finished rendering as soon as submit() returns.
    wait = WebDriverWait(driver, 10)
    metatitle = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "LC20lb"))
    )
    metadescription = wait.until(
        EC.presence_of_element_located((By.CLASS_NAME, "st"))
    )
    return {
        'metatitle': metatitle.text,
        'metadescription': metadescription.text
    }
|
||||
Loading…
x
Reference in New Issue
Block a user