aphd · apierr · Sep 12, 2019 · Sep 12, 2019 · Sep 12, 2019 · Sep 16, 2019
diff --git a/.gitignore b/.gitignore
@@ -2,10 +2,13 @@ dependency-reduced-pom.xml
 target/
 .DS_Store
 __pycache__
+sol
 
 # Jupyter Notebook
 examples/.ipynb_checkpoints
 examples/*.ipynb
+examples/*html_files*
 
 #Editor 
 *.vscode
+*tar.bz2
diff --git a/examples/client.py b/examples/client.py
@@ -7,8 +7,9 @@ def download():
     e = EtherScanIoApi()
 
     gap = 5760  # 1 block each  day
-    start = 3000000
+    start = 5000000
     end = 8000000
+    #end = 8000000
 
     import random
     for block in range(start, end, gap):

diff --git a/examples/downloadSCs.py b/examples/downloadSCs.py
@@ -11,12 +11,12 @@
     contracts.json
 """
 from pyetherchain.pyetherchain import UserAgent
-from pyetherchain.pyetherchain import EtherChain
 import configparser
 import re
 import requests
 import sys
 from bs4 import BeautifulSoup
+import traceback
 
 
 class EtherScanIoApi(object):
@@ -34,106 +34,37 @@ def __init__(self, proxies={}):
         self.config.read('config.ini')
         self.session = UserAgent(
             baseurl="https://etherscan.io", retry=5, retrydelay=8, proxies=proxies)
-        self.ec = EtherChain()
         self.soup = None
 
     def get_contracts_from_block(self, block):
+
         soup = BeautifulSoup(requests.get(
             'https://etherscan.io/txs?block=' + str(block)).text, features="html.parser")
         addresses = soup.select("i[title='Contract']")
+
         for address in list(set(map(lambda x: x.findNext('a')['href'].replace('/address/', ''), addresses))):
-            if not self._is_new_address(address):
-                continue
-            describe_contract = self.ec.account(address).describe_contract
+            print(address)
             self._set_soup(address)
-            contract = {'address': address,
-                        'name': self._get_contract_name(),
-                        'compiler': None,
-                        'compiler_version': self._get_compiler_version(),
-                        'balance': describe_contract.__self__['balance'],
-                        'txcount': describe_contract.__self__['txreceived'],
-                        'firstseen': describe_contract.__self__['firstseen'],
-                        'lastseen': describe_contract.__self__['lastseen']
-                        }
-            yield contract
-
-    def get_contracts_from_etherscan(self, start=0, end=None):
-        page = start
-
-        while not end or page <= end:
-            resp = self.session.get("/contractsVerified/%d" % page).text
-            page, lastpage = re.findall(
-                r'Page <.*>(\d+)</.*> of <.*>(\d+)</.*>', resp)[0]
-            page, lastpage = int(page), int(lastpage)
-            if not end:
-                end = lastpage
-            rows = self._parse_tbodies(resp)[0]  # only use first tbody
-            for col in rows:
-                address = self._extract_text_from_html(col[0]).split(" ", 1)[0]
-                if not self._is_new_address(address):
-                    continue
-                describe_contract = self.ec.account(address).describe_contract
-                firstseen = describe_contract.__self__['firstseen']
-                lastseen = describe_contract.__self__['lastseen']
-                contract = {'address': address,
-                            'name': self._extract_text_from_html(col[1]),
-                            'compiler': self._extract_text_from_html(col[2]),
-                            'compiler_version': self._extract_text_from_html(col[3]),
-                            'balance': self._get_balance(self._extract_text_from_html(col[4])),
-                            'txcount': self._extract_text_from_html(col[5]),
-                            'firstseen': firstseen,
-                            'lastseen': firstseen
-                            }
-                yield contract
-            page += 1
-
-    def write_etherChain_fn(self, contracts=[]):
-        amount = 100
-        for nr, c in enumerate(contracts):
-            with open(self.config['DEFAULT']['etherChain_fn'], 'a+') as f:
-                print("got contract: %s" % c)
-
-                f_path = os.path.join(
-                    self.config['DEFAULT']['output_path'], '%s.sol' % (c["address"]))
-                try:
-                    source = self._get_contract_source(c["address"]).strip()
-                    if not len(source):
-                        raise Exception(c)
-                except Exception as e:
-                    continue
-
-                f.write("%s\n" % c)
-                with open(f_path, "wb") as f:
-                    f.write(bytes(source, "utf8"))
-
-                print("[%d/%d] dumped --> %s (%-20s) -> %s" %
-                      (nr, amount, c["address"], c["name"], f_path))
+            self.write_etherChain_fn(address)
+            # yield contract
 
-                nr += 1
-                if nr >= amount:
-                    break
+    def write_etherChain_fn(self, address):
+        with open(self.config['DEFAULT']['etherChain_fn'], 'a+') as f:
+            print("got contract: %s" % address)
+            dir_path = os.path.join(
+                self.config['DEFAULT']['output_path'] + address[0:4])
+            f_path = os.path.join(dir_path, '%s.html' % (address))
 
-    def _get_contract_source(self, address):
-        import time
-        e = None
-        for _ in range(5):
-            resp = self.session.get("/address/%s" % address).text
-            print("/address/%s" % address)
-            if "You have reached your maximum request limit for this resource. Please try again later" in resp:
-                print("[[THROTTELING]]")
-                time.sleep(1+2.5*_)
-                continue
             try:
-                print("=======================================================")
-                print(address)
-                resp = resp.split(
-                    "<pre class='js-sourcecopyarea editor' id='editor' style='margin-top: 5px;'>", 1)[1]
-                resp = resp.split("</pre><br>", 1)[0]
-                return resp.replace("&lt;", "<").replace("&gt;", ">").replace("&le;", "<=").replace("&ge;", ">=").replace("&amp;", "&").replace("&vert;", "|")
+                source = self.soup.find(id="editor").text
+                abi = self.soup.find(id="js-copytextarea2").text
+                byteCode = self.soup.find(id="verifiedbytecode2").text
+                os.makedirs(dir_path, exist_ok=True)
+                with open(f_path, "wb") as f:
+                    f.write(bytes(source + '\n### ABI:\n' + abi +
+                                  '\n### byteCode:\n' + byteCode, "utf8"))
             except:
                 print(traceback.format_exc())
-                time.sleep(1 + 2.5 * _)
-                break
 
     def _is_new_address(self, address):
         if (address not in open(self.config['DEFAULT']['smec_fn']).read()):
@@ -144,67 +75,7 @@ def _set_soup(self, address):
         url = address.join(['https://etherscan.io/address/', '#code'])
         self.soup = BeautifulSoup(requests.get(url).text, 'html.parser')
 
-    def _get_compiler_version(self):
-        try:
-            str = self.soup.findAll('span', text=re.compile('v0.'))[
-                0].contents[0]
-            return re.search('v(\d{1,2}.\d{1,2}.\d{1,2})', str)[1]
-        except IndexError:
-            return None
-
-    def _get_contract_name(self):
-        try:
-            return self.soup.find(lambda tag: tag.name == "span" and "Name" in tag.text).parent.find_next('td').contents[0].strip()
-        except:
-            return None
-
-    def _get_addresses_from_fn(self, fn):
-        try:
-            fp = open(fn)
-            return list(filter(None,
-                               map(lambda x: x.strip(),
-                                   fp.readlines())
-                               ))
-        finally:
-            fp.close()
-
-    def _extract_text_from_html(self, s):
-        return re.sub('<[^<]+?>', '', s).strip()
-
-    def _extract_hexstr_from_html_attrib(self, s):
-        return ''.join(re.findall(r".+/([^']+)'", s)) if ">" in s and "</" in s else s
-
-    def _get_balance(self, balance):
-        try:
-            return int(re.sub('[a-zA-Z]', '', balance))
-        except ValueError:
-            return None
-
-    def _get_pageable_data(self, path, start=0, length=10):
-        params = {
-            "start": start,
-            "length": length,
-        }
-        resp = self.session.get(path, params=params).json()
-        # cleanup HTML from response
-        for item in resp['data']:
-            keys = item.keys()
-            for san_k in set(keys).intersection(set(("account", "blocknumber", "type", "direction"))):
-                item[san_k] = self._extract_text_from_html(item[san_k])
-            for san_k in set(keys).intersection(("parenthash", "from", "to", "address")):
-                item[san_k] = self._extract_hexstr_from_html_attrib(
-                    item[san_k])
-        return resp
-
-    def _parse_tbodies(self, data):
-        tbodies = []
-        for tbody in re.findall(r"<tbody.*?>(.+?)</tbody>", data, re.DOTALL):
-            rows = []
-            for tr in re.findall(r"<tr.*?>(.+?)</tr>", tbody):
-                rows.append(re.findall(r"<td.*?>(.+?)</td>", tr))
-            tbodies.append(rows)
-        return tbodies
-
 
 if __name__ == "__main__":
-    pass
+    e = EtherScanIoApi()
+    e.get_contracts_from_block(6000003)