Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,13 @@ dependency-reduced-pom.xml
target/
.DS_Store
__pycache__
sol

# Jupyter Notebook
examples/.ipynb_checkpoints
examples/*.ipynb
examples/*html_files*

#Editor
*.vscode
*tar.bz2
3 changes: 2 additions & 1 deletion examples/client.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ def download():
e = EtherScanIoApi()

gap = 5760 # 1 block each day
start = 3000000
start = 5000000
end = 8000000
#end = 8000000

import random
for block in range(start, end, gap):
Expand Down
171 changes: 21 additions & 150 deletions examples/downloadSCs.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,12 @@
contracts.json
"""
from pyetherchain.pyetherchain import UserAgent
from pyetherchain.pyetherchain import EtherChain
import configparser
import re
import requests
import sys
from bs4 import BeautifulSoup
import traceback


class EtherScanIoApi(object):
Expand All @@ -34,106 +34,37 @@ def __init__(self, proxies={}):
self.config.read('config.ini')
self.session = UserAgent(
baseurl="https://etherscan.io", retry=5, retrydelay=8, proxies=proxies)
self.ec = EtherChain()
self.soup = None

def get_contracts_from_block(self, block):

soup = BeautifulSoup(requests.get(
'https://etherscan.io/txs?block=' + str(block)).text, features="html.parser")
addresses = soup.select("i[title='Contract']")

for address in list(set(map(lambda x: x.findNext('a')['href'].replace('/address/', ''), addresses))):
if not self._is_new_address(address):
continue
describe_contract = self.ec.account(address).describe_contract
print(address)
self._set_soup(address)
contract = {'address': address,
'name': self._get_contract_name(),
'compiler': None,
'compiler_version': self._get_compiler_version(),
'balance': describe_contract.__self__['balance'],
'txcount': describe_contract.__self__['txreceived'],
'firstseen': describe_contract.__self__['firstseen'],
'lastseen': describe_contract.__self__['lastseen']
}
yield contract

def get_contracts_from_etherscan(self, start=0, end=None):
page = start

while not end or page <= end:
resp = self.session.get("/contractsVerified/%d" % page).text
page, lastpage = re.findall(
r'Page <.*>(\d+)</.*> of <.*>(\d+)</.*>', resp)[0]
page, lastpage = int(page), int(lastpage)
if not end:
end = lastpage
rows = self._parse_tbodies(resp)[0] # only use first tbody
for col in rows:
address = self._extract_text_from_html(col[0]).split(" ", 1)[0]
if not self._is_new_address(address):
continue
describe_contract = self.ec.account(address).describe_contract
firstseen = describe_contract.__self__['firstseen']
lastseen = describe_contract.__self__['lastseen']
contract = {'address': address,
'name': self._extract_text_from_html(col[1]),
'compiler': self._extract_text_from_html(col[2]),
'compiler_version': self._extract_text_from_html(col[3]),
'balance': self._get_balance(self._extract_text_from_html(col[4])),
'txcount': self._extract_text_from_html(col[5]),
'firstseen': firstseen,
'lastseen': firstseen
}
yield contract
page += 1

def write_etherChain_fn(self, contracts=[]):
amount = 100
for nr, c in enumerate(contracts):
with open(self.config['DEFAULT']['etherChain_fn'], 'a+') as f:
print("got contract: %s" % c)

f_path = os.path.join(
self.config['DEFAULT']['output_path'], '%s.sol' % (c["address"]))
try:
source = self._get_contract_source(c["address"]).strip()
if not len(source):
raise Exception(c)
except Exception as e:
continue

f.write("%s\n" % c)
with open(f_path, "wb") as f:
f.write(bytes(source, "utf8"))

print("[%d/%d] dumped --> %s (%-20s) -> %s" %
(nr, amount, c["address"], c["name"], f_path))
self.write_etherChain_fn(address)
# yield contract

nr += 1
if nr >= amount:
break
def write_etherChain_fn(self, address):
with open(self.config['DEFAULT']['etherChain_fn'], 'a+') as f:
print("got contract: %s" % address)
dir_path = os.path.join(
self.config['DEFAULT']['output_path'] + address[0:4])
f_path = os.path.join(dir_path, '%s.html' % (address))

def _get_contract_source(self, address):
import time
e = None
for _ in range(5):
resp = self.session.get("/address/%s" % address).text
print("/address/%s" % address)
if "You have reached your maximum request limit for this resource. Please try again later" in resp:
print("[[THROTTELING]]")
time.sleep(1+2.5*_)
continue
try:
print("=======================================================")
print(address)
resp = resp.split(
"<pre class='js-sourcecopyarea editor' id='editor' style='margin-top: 5px;'>", 1)[1]
resp = resp.split("</pre><br>", 1)[0]
return resp.replace("&lt;", "<").replace("&gt;", ">").replace("&le;", "<=").replace("&ge;", ">=").replace("&amp;", "&").replace("&vert;", "|")
source = self.soup.find(id="editor").text
abi = self.soup.find(id="js-copytextarea2").text
byteCode = self.soup.find(id="verifiedbytecode2").text
os.makedirs(dir_path, exist_ok=True)
with open(f_path, "wb") as f:
f.write(bytes(source + '\n### ABI:\n' + abi +
'\n### byteCode:\n' + byteCode, "utf8"))
except:
print(traceback.format_exc())
time.sleep(1 + 2.5 * _)
break

def _is_new_address(self, address):
if (address not in open(self.config['DEFAULT']['smec_fn']).read()):
Expand All @@ -144,67 +75,7 @@ def _set_soup(self, address):
url = address.join(['https://etherscan.io/address/', '#code'])
self.soup = BeautifulSoup(requests.get(url).text, 'html.parser')

def _get_compiler_version(self):
try:
str = self.soup.findAll('span', text=re.compile('v0.'))[
0].contents[0]
return re.search('v(\d{1,2}.\d{1,2}.\d{1,2})', str)[1]
except IndexError:
return None

def _get_contract_name(self):
try:
return self.soup.find(lambda tag: tag.name == "span" and "Name" in tag.text).parent.find_next('td').contents[0].strip()
except:
return None

def _get_addresses_from_fn(self, fn):
try:
fp = open(fn)
return list(filter(None,
map(lambda x: x.strip(),
fp.readlines())
))
finally:
fp.close()

def _extract_text_from_html(self, s):
return re.sub('<[^<]+?>', '', s).strip()

def _extract_hexstr_from_html_attrib(self, s):
return ''.join(re.findall(r".+/([^']+)'", s)) if ">" in s and "</" in s else s

def _get_balance(self, balance):
try:
return int(re.sub('[a-zA-Z]', '', balance))
except ValueError:
return None

def _get_pageable_data(self, path, start=0, length=10):
params = {
"start": start,
"length": length,
}
resp = self.session.get(path, params=params).json()
# cleanup HTML from response
for item in resp['data']:
keys = item.keys()
for san_k in set(keys).intersection(set(("account", "blocknumber", "type", "direction"))):
item[san_k] = self._extract_text_from_html(item[san_k])
for san_k in set(keys).intersection(("parenthash", "from", "to", "address")):
item[san_k] = self._extract_hexstr_from_html_attrib(
item[san_k])
return resp

def _parse_tbodies(self, data):
tbodies = []
for tbody in re.findall(r"<tbody.*?>(.+?)</tbody>", data, re.DOTALL):
rows = []
for tr in re.findall(r"<tr.*?>(.+?)</tr>", tbody):
rows.append(re.findall(r"<td.*?>(.+?)</td>", tr))
tbodies.append(rows)
return tbodies


if __name__ == "__main__":
pass
e = EtherScanIoApi()
e.get_contracts_from_block(6000003)
Loading