Initial commit

This commit is contained in:
Niles Rogoff 2016-06-29 23:05:31 -04:00
commit b86682931f
3 changed files with 109 additions and 0 deletions

35
index.pyhtml Normal file
View File

@ -0,0 +1,35 @@
## Mako template: renders every record found in the scraper's database as one
## HTML page, most recent post first.
<%!
# Imports used by the template: BTEdb for the database file, time for
# formatting each post's timestamp.
import BTEdb, time
%>
<%
# Same database path that scraper.py writes to.
db = BTEdb.Database("/dev/shm/lainchan-scraper.json")
# The result is used below as a list of rows indexed by column name
# ("board", "match", "parent_thread_id", "time", "text", "post").
dumped = db.Dump("table")
# Ascending by post time; the loop further down walks the list backwards,
# so the page shows the newest post at the top.
dumped.sort(key = lambda x: x["time"])
%>
<!doctype html>
<html>
<head>
<title>Lainchan scraper for IPFS IDs</title>
<link rel="stylesheet" href="/style.css" />
<link rel="stylesheet" href="https://lainchan.org/stylesheets/code/dark.css" />
<script type="text/javascript" src="https://lainchan.org/js/jquery.min.js"></script>
<script type="text/javascript" src="https://lainchan.org/js/prettify.js"></script>
<script type="text/javascript" src="https://lainchan.org/js/hilight.js"></script>
</head>
<body>
<h1>IPFS Scraper</h1>
<div id="content">
## One "post" div per stored row, iterating the time-sorted list in reverse.
% for post in dumped[::-1]:
<div class="post">
## Link back to the originating post: board, thread page, then a #q<post no> fragment.
<a class="postlink" href="https://lainchan.org/${post["board"]}/res/${post["parent_thread_id"]}.html#q${post["post"]}" target="_blank">Post</a> -
## The matched ID, linked through the public ipfs.io gateway and a gateway on localhost:8080.
<a class="ipfslink-public" href="https://ipfs.io/ipfs/${post["match"]}">public</a>
<a class="ipfslink-local" href="http://localhost:8080/ipfs/${post["match"]}">local</a> -
${post["match"]}
## post["time"] is passed to time.localtime, i.e. treated as seconds since the epoch,
## and shown in the server's local time zone.
<span class="date">${time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(post["time"]))}</span>
## The stored post body is emitted with two substitutions: every U+03BB character
## becomes "lam", and href attributes starting with "/<board>" are rewritten to
## absolute https://lainchan.org/ URLs so links still work from this page.
## NOTE(review): the text is third-party markup; whether it is HTML-escaped here
## depends on the Mako default filters configured by the host - confirm.
<div class="embedded">${post["text"].replace(u"\u03bb", "lam").replace('href="/' + post["board"], 'href="https://lainchan.org/' + post["board"])}</div>
</div>
% endfor
</div>
<div id="footer">Niles Rogoff 2016</div>
</body>
</html>

29
scraper.py Normal file
View File

@ -0,0 +1,29 @@
import urllib.request, json, BTEdb, re, time
# Scrape the configured lainchan boards for IPFS-ID-shaped tokens and store one
# row per hit in the BTEdb file that index.pyhtml reads.
db = BTEdb.Database("/dev/shm/lainchan-scraper.json")
db.BeginTransaction(False)
boards = ['lam', "tech"]
# A run of exactly 46 ASCII letters/digits bounded by word boundaries; this is
# the token shape the scraper treats as an IPFS ID.
regex = re.compile(r"\b[A-Za-z0-9]{46}\b")


def _fetch_json(url):
    """Download *url* and return its body decoded as UTF-8 and parsed as JSON.

    The response is closed as soon as it has been read.
    """
    with urllib.request.urlopen(url) as response:
        return json.loads(response.read().decode("utf-8"))


# Every run rebuilds the table from scratch.
if db.TableExists("table"):
    db.Truncate("table")
else:
    db.CreateTable("table")

for board in boards:
    threads = _fetch_json("https://lainchan.org/" + board + "/threads.json")
    for page in threads:
        for thread in page["threads"]:
            print(thread["no"])
            # Pause before each thread download to keep the request rate low.
            time.sleep(5)
            thread_url = "https://lainchan.org/" + board + "/res/" + str(thread["no"]) + ".json"
            for post in _fetch_json(thread_url)["posts"]:
                # Posts without a "com" field have nothing to search.
                if "com" not in post:
                    continue
                # finditer yields every ID in the post. The previous
                # search()/group(i) loop only ever stored the first one,
                # because the pattern has no capture groups and group(1)
                # raised straight away. Insert errors are no longer hidden
                # behind a bare except; they now propagate.
                for result in regex.finditer(post["com"]):
                    db.Insert(
                        "table",
                        board=board,
                        match=result.group(0),
                        parent_thread_id=thread["no"],
                        time=post["time"],
                        text=post["com"],
                        post=post["no"],
                    )
                    print(post["com"])
db.CommitTransaction()
# NOTE(review): Destroy() is presumably BTEdb's way of releasing the database
# handle - confirm against the BTEdb documentation.
db.Destroy()

45
style.css Normal file
View File

@ -0,0 +1,45 @@
/* Page defaults: light grey canvas, sans-serif text. */
body {
    font-family: sans-serif;
    background-color: #efeff0;
    /*color: #eeeeee;*/
}

/* Centered page title. */
h1 {
    text-align: center;
}

/* Links: dark red, underlined only while hovered. */
a {
    /*color: #1d1f21;*/
    text-decoration: none;
    color: #aa0000;
}

a:hover {
    text-decoration: underline;
}

/* Wrapper around the list of results. */
#content {
    padding: 50px;
}
/* Rounded, bordered box that holds the embedded post body under each result. */
.embedded {
    /* Black at 20% opacity. The previous value, rgba(0,0,0.2), was missing a
       comma: parsers either discard it (no tint at all) or read it as an
       opaque near-black colour, which hides the text. */
    background-color: rgba(0, 0, 0, 0.2);
    border: 1px solid black;
    border-radius: 10px;
    padding: 10px;
    margin-bottom: 20px;
}
/* Elements carrying the "quote" class are shown in dark green. */
.quote {
    color: darkgreen;
}

/* Timestamp of each result, pushed to the right edge of its row. */
.date {
    float: right;
    text-align: right;
}

/* Small attribution label pinned to the bottom-right corner of the viewport,
   drawn above the page content. */
#footer {
    position: fixed;
    right: 20px;
    bottom: 20px;
    z-index: 100;
    padding: 2px;
    border-radius: 2px;
    font-family: monospace;
    font-size: 0.8em;
    color: grey;
    background-color: inherit;
}