{"id":32,"date":"2018-03-22T13:13:38","date_gmt":"2018-03-22T05:13:38","guid":{"rendered":"http:\/\/www.sky94.com\/?p=32"},"modified":"2018-03-22T13:13:38","modified_gmt":"2018-03-22T05:13:38","slug":"%e5%9f%ba%e4%ba%8epython%e7%88%ac%e5%8f%96%e8%bd%ac%e8%bd%ac30w%e4%ba%8c%e6%89%8b%e4%ba%a4%e6%98%93%e6%95%b0%e6%8d%ae","status":"publish","type":"post","link":"https:\/\/www.sky94.com\/?p=32","title":{"rendered":"\u57fa\u4e8ePython\u722c\u53d6\u8f6c\u8f6c30W\u4e8c\u624b\u4ea4\u6613\u6570\u636e"},"content":{"rendered":"<p>*\u722c\u53d6\u7ed3\u679c<\/p>\n<p><img decoding=\"async\" loading=\"lazy\" class=\"alignnone size-full wp-image-33\" src=\"http:\/\/www.sky94.com\/wp-content\/uploads\/2018\/03\/2.png\" alt=\"\" width=\"1020\" height=\"356\" srcset=\"https:\/\/www.sky94.com\/wp-content\/uploads\/2018\/03\/2.png 1020w, https:\/\/www.sky94.com\/wp-content\/uploads\/2018\/03\/2-300x105.png 300w, https:\/\/www.sky94.com\/wp-content\/uploads\/2018\/03\/2-768x268.png 768w\" sizes=\"(max-width: 1020px) 100vw, 1020px\" \/><\/p>\n<p>1.\u83b7\u53d6\u4e8c\u624b\u5546\u54c1\u5206\u7c7bLink<\/p>\n<pre>import requests\r\nfrom bs4 import BeautifulSoup\r\nimport pymongo\r\nclient = pymongo.MongoClient('localhost',27017)\r\ndb_project = client['58project']\r\ncateLinks = db_project['cateLinks']\r\n\r\ndef getCateLink():\r\n    baseUrl = 'http:\/\/sz.58.com\/sale.shtml'\r\n    wbData = requests.get(baseUrl)\r\n    soup = BeautifulSoup(wbData.text,'lxml')\r\n    links = soup.select('ul.ym-submnu &gt; li &gt; b &gt; a')\r\n    return list(set('http:\/\/sz.58.com{}'.format(link.get('href')) for link in links))\r\n\r\nlinks = getCateLink()\r\nfor link in links:\r\n    cateLinks.insert_one({'cateLinks':link})<\/pre>\n<p>2.\u83b7\u53d6\u4e8c\u624b\u5546\u54c1Link<\/p>\n<pre>import requests\r\nfrom bs4 import BeautifulSoup\r\nimport time\r\nimport random\r\nimport pymongo\r\nimport re\r\n\r\nclient = pymongo.MongoClient('localhost', 27017)\r\ndb_project = 
client['58project']\r\ngoodsLinksSheet = db_project['new_goodsLinks']\r\nerrorNote = db_project['errorNote']\r\n\r\n\r\ndef getGoodsLink(cateLink, page):\r\n    url = '{}pn{}\/'.format(cateLink, str(page))\r\n    wbData = requests.get(url)\r\n    if not wbData:\r\n        errorNote.insert_one({'cateLink':cateLink,'page':page})\r\n        return False\r\n    soup = BeautifulSoup(wbData.text, 'lxml')\r\n    if not soup.find('div', 'noinfotishi'):\r\n        time.sleep(random.randrange(2, 4))\r\n        for link in soup.find_all('a', class_='t', href=re.compile('zhuanzhuan.58')):\r\n            goodsLink = link.get('href').split('?')[0]\r\n            goodsLinksSheet.insert_one({'goodsLink': goodsLink})\r\n            print(cateLink,'|',page,'|', goodsLink)\r\n    else:\r\n        return False<\/pre>\n<p>3.\u83b7\u53d6\u4e8c\u624b\u5546\u54c1\u8be6\u60c5\u4fe1\u606f\u5e76\u9010\u6761\u63d2\u5165MongoDB<\/p>\n<pre>import requests\r\nfrom bs4 import BeautifulSoup\r\nimport time\r\nfrom getMongoData import getData\r\nimport pymongo\r\nimport random\r\nimport re\r\n\r\nclient = pymongo.MongoClient('localhost', 27017)\r\nprojectDb = client['58project']\r\ngoodsInfo = projectDb['goodsInfo']\r\nfiter = projectDb['fiter']\r\n\r\n\r\ndef getGoodsinfo(goodsLink):\r\n    if goodsInfo.find({'goodsLink': goodsLink}).count() == 0:\r\n        time.sleep(random.randrange(3, 5))\r\n        wbData = requests.get(goodsLink)\r\n        if wbData.status_code == 200:\r\n            soup = BeautifulSoup(wbData.text, 'lxml')\r\n            if soup.find('p',class_ = 'personal_name'):\r\n                name = soup.find('p', class_='personal_name').get_text() if soup.find('p', class_='personal_name') else None\r\n                join58Age = re.sub('\\D', '', soup.find('p', class_='personal_chengjiu').get_text()) if soup.find('p',class_='personal_chengjiu') else 0\r\n                orderNum = soup.find('span', class_='numdeal').get_text() if soup.find('span', class_='numdeal') else 0\r\n       
         title = soup.find('h1', class_='info_titile').get_text() if soup.find('h1', class_='info_titile') else None\r\n                viewTimes = re.sub('\\D', '', soup.find('span', class_='look_time').get_text()) if soup.find('span', class_='look_time') else 0\r\n                price = soup.select('.price_now  i')[0].get_text() if soup.select('.price_now  i') else 0\r\n                address = soup.select('.palce_li  i')[0].get_text().split('-') if soup.select('.palce_li  i') else None\r\n                bodyPic = list(map(lambda x: x.get('src'), soup.select('div.boby_pic &gt; img'))) if soup.select(\r\n                    'div.boby_pic &gt; img') else None\r\n                describe = soup.select('.baby_kuang p')[0].get_text() if soup.select('.baby_kuang p') else None\r\n                headImgLink = soup.select('.personal_touxiang img')[0].get('src') if soup.select('.personal_touxiang img') else None\r\n                bodyPic = '|'.join(bodyPic) if bodyPic != None else None\r\n                data = {\r\n                    'name': name,\r\n                    'join58Age': int(join58Age) if join58Age != '' else 0,\r\n                    'orderNum': int(orderNum),\r\n                    'title': title,\r\n                    'viewTimes': int(viewTimes) if viewTimes != '' else 0,\r\n                    'price': int(price) if price.isdigit() else 0,\r\n                    'address': address,\r\n                    'describe': describe,\r\n                    'headImgLink': headImgLink,\r\n                    'bodyPic': bodyPic,\r\n                    'goodsLink': goodsLink\r\n                }\r\n                goodsInfo.insert_one(data)\r\n                fiter.insert_one({'url': goodsLink})\r\n                print(data, '\\n')\r\n    else:\r\n        print(goodsLink)\r\n\r\ndef deleteData():\r\n    res = goodsInfo.delete_many({'name': ''})\r\n    print(res)\r\n\r\ndef getLineNum():\r\n    # goodsLink = getData('new_goodsLinks')\r\n    # res = 
set(map(lambda x:x['goodsLink'],goodsLink))\r\n    res = goodsInfo.distinct('goodsLink')\r\n    print(len(res))\r\n\r\ndef repeat():\r\n    links = goodsInfo.find()\r\n    tmp = set()\r\n    for link in links:\r\n        if link['goodsLink'] not in tmp:\r\n            tmp.add(link['goodsLink'])\r\n        else:\r\n            # goodsInfo.delete_one({'goodsLink':link['goodsLink']})\r\n            print(link)\r\n\r\n<\/pre>\n<p>4.\u83b7\u53d6MongoDB\u5df2\u5b58\u50a8\u7684\u6570\u636e<\/p>\n<pre>import pymongo\r\n\r\ndef getData(sheetName):\r\n    client = pymongo.MongoClient('localhost',27017)\r\n    projectDb = client['58project']\r\n    sheetObj = projectDb[sheetName]\r\n    return sheetObj.find()<\/pre>\n<p>5.\u6267\u884c\u5165\u53e3\u4e3b\u6587\u4ef6\uff0c\u591a\u7ebf\u7a0b\u5f02\u6b65\u6267\u884c\u722c\u53d6\u4efb\u52a1<\/p>\n<pre>from getGoodsLinks import getGoodsLink\r\nfrom multiprocessing import Pool\r\nfrom getMongoData import getData\r\nfrom getGoodsInfo import getGoodsinfo\r\nfrom getGoodsInfo import deleteData\r\nfrom getGoodsInfo import getLineNum\r\nfrom getGoodsInfo import repeat\r\n\r\n\r\ndef startSpider(cateLink):\r\n    for page in range(1, 101):\r\n        if not getGoodsLink(cateLink, str(page)):\r\n            continue\r\n\r\n\r\n# if __name__ == '__main__':\r\n#     pool = Pool(processes = 8)\r\n#     links = getCateLink()\r\n#     pool.map(startSpider,links)\r\n# \r\n# if __name__ == '__main__':\r\n#     goodsLink = getData('new_goodsLinks')\r\n#     urlPond = list(map(lambda x:x['goodsLink'],goodsLink))\r\n#     pool = Pool(processes=4)\r\n#     pool.map(getGoodsinfo, urlPond)\r\n# \r\n# if __name__ == '__main__':\r\n#     getLineNum()\r\n\r\n<\/pre>\n<p>&nbsp;<\/p>\n","protected":false},"excerpt":{"rendered":"<p>*\u722c\u53d6\u7ed3\u679c 1.\u83b7\u53d6\u4e8c\u624b\u5546\u54c1\u5206\u7c7bLink import requests from bs4 import Be &hellip; <a href=\"https:\/\/www.sky94.com\/?p=32\" class=\"more-link\">\u7ee7\u7eed\u9605\u8bfb<span 
class=\"screen-reader-text\">\u57fa\u4e8ePython\u722c\u53d6\u8f6c\u8f6c30W\u4e8c\u624b\u4ea4\u6613\u6570\u636e<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[],"tags":[11],"_links":{"self":[{"href":"https:\/\/www.sky94.com\/index.php?rest_route=\/wp\/v2\/posts\/32"}],"collection":[{"href":"https:\/\/www.sky94.com\/index.php?rest_route=\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.sky94.com\/index.php?rest_route=\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.sky94.com\/index.php?rest_route=\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/www.sky94.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcomments&post=32"}],"version-history":[{"count":1,"href":"https:\/\/www.sky94.com\/index.php?rest_route=\/wp\/v2\/posts\/32\/revisions"}],"predecessor-version":[{"id":34,"href":"https:\/\/www.sky94.com\/index.php?rest_route=\/wp\/v2\/posts\/32\/revisions\/34"}],"wp:attachment":[{"href":"https:\/\/www.sky94.com\/index.php?rest_route=%2Fwp%2Fv2%2Fmedia&parent=32"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.sky94.com\/index.php?rest_route=%2Fwp%2Fv2%2Fcategories&post=32"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.sky94.com\/index.php?rest_route=%2Fwp%2Fv2%2Ftags&post=32"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}