基于Python爬取转转30W二手交易数据

*爬取结果

1.获取二手商品分类Link

import requests
from bs4 import BeautifulSoup
import pymongo
client = pymongo.MongoClient('localhost',27017)
db_project = client['58project']
cateLinks = db_project['cateLinks']

def getCateLink():
    baseUrl = 'http://sz.58.com/sale.shtml'
    wbData = requests.get(baseUrl)
    soup = BeautifulSoup(wbData.text,'lxml')
    links = soup.select('ul.ym-submnu > li > b > a')
    return list(set('http://sz.58.com{}'.format(link.get('href')) for link in links))

links = getCateLink()
for link in links:
    cateLinks.insert_one({'cateLinks':link})

2.获取二手商品Link

import requests
from bs4 import BeautifulSoup
import time
import random
import pymongo
import re

client = pymongo.MongoClient('localhost', 27017)
db_project = client['58project']
goodsLinksSheet = db_project['new_goodsLinks']
errorNote = db_project['errorNote']


def getGoodsLink(cateLink, page):
    url = '{}pn{}/'.format(cateLink, str(page))
    wbData = requests.get(url)
    if not wbData:
        errorNote.insert_one({'cateLink':cateLink,'page':page})
        return False
    soup = BeautifulSoup(wbData.text, 'lxml')
    if not soup.find('div', 'noinfotishi'):
        time.sleep(random.randrange(2, 4))
        for link in soup.find_all('a', class_='t', href=re.compile('zhuanzhuan.58')):
            goodsLink = link.get('href').split('?')[0]
            goodsLinksSheet.insert_one({'goodsLink': goodsLink})
            print(cateLink,'|',page,'|', goodsLink)
    else:
        return False

3.获取二手商品详情信息并逐条插入MongoDB

import requests
from bs4 import BeautifulSoup
import time
from getMongoData import getData
import pymongo
import random
import re

client = pymongo.MongoClient('localhost', 27017)
projectDb = client['58project']
goodsInfo = projectDb['goodsInfo']
fiter = projectDb['fiter']


def getGoodsinfo(goodsLink):
    if goodsInfo.find({'goodsLink': goodsLink}).count() == 0:
        time.sleep(random.randrange(3, 5))
        wbData = requests.get(goodsLink)
        if wbData.status_code == 200:
            soup = BeautifulSoup(wbData.text, 'lxml')
            if soup.find('p',class_ = 'personal_name'):
                name = soup.find('p', class_='personal_name').get_text() if soup.find('p', class_='personal_name') else None
                join58Age = re.sub('\D', '', soup.find('p', class_='personal_chengjiu').get_text()) if soup.find('p',class_='personal_chengjiu') else 0
                orderNum = soup.find('span', class_='numdeal').get_text() if soup.find('span', class_='numdeal') else 0
                title = soup.find('h1', class_='info_titile').get_text() if soup.find('h1', class_='info_titile') else None
                viewTimes = re.sub('\D', '', soup.find('span', class_='look_time').get_text()) if soup.find('span', class_='look_time') else 0
                price = soup.select('.price_now  i')[0].get_text() if soup.select('.price_now  i') else 0
                address = soup.select('.palce_li  i')[0].get_text().split('-') if soup.select('.palce_li  i') else None
                bodyPic = list(map(lambda x: x.get('src'), soup.select('div.boby_pic > img'))) if soup.select(
                    'div.boby_pic > img') else None
                describe = soup.select('.baby_kuang p')[0].get_text() if soup.select('.baby_kuang p') else None
                headImgLink = soup.select('.personal_touxiang img')[0].get('src') if soup.select('.personal_touxiang img') else None
                bodyPic = '|'.join(bodyPic) if bodyPic != None else None
                data = {
                    'name': name,
                    'join58Age': int(join58Age) if join58Age != '' else 0,
                    'orderNum': int(orderNum),
                    'title': title,
                    'viewTimes': int(viewTimes) if viewTimes != '' else 0,
                    'price': int(price) if price.isdigit() else 0,
                    'address': address,
                    'describe': describe,
                    'headImgLink': headImgLink,
                    'bodyPic': bodyPic,
                    'goodsLink': goodsLink
                }
                goodsInfo.insert_one(data)
                fiter.insert_one({'url': goodsLink})
                print(data, '\n')
    else:
        print(goodsLink)

def deleteData():
    res = goodsInfo.delete_many({'name': ''})
    print(res)

def getLineNum():
    # goodsLink = getData('new_goodsLinks')
    # res = set(map(lambda x:x['goodsLink'],goodsLink))
    res = goodsInfo.distinct('goodsLink')
    print(len(res))

def repeat():
    links = goodsInfo.find()
    tmp = set()
    for link in links:
        if link['goodsLink'] not in tmp:
            tmp.add(link['goodsLink'])
        else:
            # goodsInfo.delete_one({'goodsLink':link['goodsLink']})
            print(link)

4.获取MongoDB已存储的数据

import pymongo

def getData(sheetName):
    client = pymongo.MongoClient('localhost',27017)
    projectDb = client['58project']
    sheetObj = projectDb[sheetName]
    return sheetObj.find()

5.执行入口主文件,多线程异步执行爬去任务

from getGoodsLinks import getGoodsLink
from multiprocessing import Pool
from getMongoData import getData
from getGoodsInfo import getGoodsinfo
from getGoodsInfo import deleteData
from getGoodsInfo import getLineNum
from getGoodsInfo import repeat


def startSpider(cateLink):
    for page in range(1, 101):
        if not getGoodsLink(cateLink, str(page)):
            continue


# if __name__ == '__main__':
#     pool = Pool(processes = 8)
#     links = getCateLink()
#     pool.map(startSpider,links)
# 
# if __name__ == '__main__':
#     goodsLink = getData('new_goodsLinks')
#     urlPond = list(map(lambda x:x['goodsLink'],goodsLink))
#     pool = Pool(processes=4)
#     pool.map(getGoodsinfo, urlPond)
# 
# if __name__ == '__main__':
#     getLineNum()

 

Mysql查询复习

-- 1、查询"01"课程比"02"课程成绩高的学生的信息及课程分数
SELECT
  a.*,
  b.s_score AS 01_score,
  c.s_score AS 02_score
FROM
  student a
  JOIN score b ON a.s_id = b.s_id AND b.c_id = '01'
  LEFT JOIN score c ON a.s_id = c.s_id AND c.c_id = '02' OR c.c_id = NULL
WHERE b.s_score > c.s_score;

SELECT
  st.*,
  sc1.s_score AS 01_score,
  sc2.s_score AS 02_score
FROM score AS sc1, score AS sc2, student AS st
WHERE sc1.c_id = '01' AND (sc2.c_id = '02' OR sc2.c_id = NULL) AND sc1.s_id = sc2.s_id AND sc1.s_score > sc2.s_score AND
      sc1.s_id = st.s_id;

-- 2、查询"01"课程比"02"课程成绩低的学生的信息及课程分数

SELECT
  a.*,
  b.s_score AS 01_score,
  c.s_score AS 02_score
FROM
  student a LEFT JOIN score b ON a.s_id = b.s_id AND b.c_id = '01' OR b.c_id = NULL
  JOIN score c ON a.s_id = c.s_id AND c.c_id = '02'
WHERE b.s_score < c.s_score;

SELECT
  st.*,
  sc1.s_score AS 01_score,
  sc2.s_score AS 02_score
FROM score AS sc1, score AS sc2, student AS st
WHERE sc1.c_id = '01' AND (sc2.c_id = '02' OR sc2.c_id = NULL) AND sc1.s_id = sc2.s_id AND sc1.s_score < sc2.s_score AND
      sc1.s_id = st.s_id;

-- 3、查询平均成绩大于等于60分的同学的学生编号和学生姓名和平均成绩
SELECT
  b.s_id,
  b.s_name,
  ROUND(AVG(a.s_score), 2) AS avg_score
FROM
  student b
  JOIN score a ON b.s_id = a.s_id
GROUP BY b.s_id, b.s_name
HAVING ROUND(AVG(a.s_score), 2) >= 60;

SELECT
  st.*,
  round(avg(s_score), 2) AS avg_score
FROM score AS sc, student AS st
WHERE sc.s_id = st.s_id
GROUP BY s_id
HAVING avg(s_score) >= 60;

-- 4、查询平均成绩小于60分的同学的学生编号和学生姓名和平均成绩
-- (包括有成绩的和无成绩的)

SELECT
  b.s_id,
  b.s_name,
  ROUND(AVG(a.s_score), 2) AS avg_score
FROM
  student b
  LEFT JOIN score a ON b.s_id = a.s_id
GROUP BY b.s_id, b.s_name
HAVING ROUND(AVG(a.s_score), 2) < 60
UNION
SELECT
  a.s_id,
  a.s_name,
  0 AS avg_score
FROM
  student a
WHERE a.s_id NOT IN (
  SELECT DISTINCT s_id
  FROM score);

-- 5、查询所有同学的学生编号、学生姓名、选课总数、所有课程的总成绩
SELECT
  a.s_id,
  a.s_name,
  count(b.c_id)  AS sum_course,
  sum(b.s_score) AS sum_score
FROM
  student a
  LEFT JOIN score b ON a.s_id = b.s_id
GROUP BY a.s_id, a.s_name;


SELECT
  st.s_id,
  st.s_name,
  count(sc.c_id),
  sum(sc.s_score)
FROM student AS st LEFT JOIN score AS sc ON st.s_id = sc.s_id
GROUP BY st.s_id;

-- 6、查询"李"姓老师的数量
SELECT count(*)
FROM teacher
WHERE t_name LIKE '李%';

-- 7、查询学过"张三"老师授课的同学的信息
SELECT a.*
FROM
  student a
  JOIN score b ON a.s_id = b.s_id
WHERE b.c_id IN (
  SELECT c_id
  FROM course
  WHERE t_id = (
    SELECT t_id
    FROM teacher
    WHERE t_name = '张三'));

SELECT *
FROM student AS st
WHERE st.s_id IN (SELECT sc.s_id
                  FROM score AS sc
                  WHERE sc.c_id IN (SELECT co.c_id
                                    FROM course AS co
                                    WHERE co.t_id IN (SELECT t_id
                                                      FROM teacher AS te
                                                      WHERE te.t_name = '张三')));

-- 8、查询没学过"张三"老师授课的同学的信息
SELECT *
FROM
  student c
WHERE c.s_id NOT IN (
  SELECT a.s_id
  FROM student a
    JOIN score b ON a.s_id = b.s_id
  WHERE b.c_id IN (
    SELECT c_id
    FROM course
    WHERE t_id = (
      SELECT t_id
      FROM teacher
      WHERE t_name = '张三')));

SELECT *
FROM student AS st
WHERE st.s_id NOT IN (SELECT sc.s_id
                      FROM score AS sc
                      WHERE sc.c_id IN (SELECT co.c_id
                                        FROM course AS co
                                        WHERE co.t_id IN (SELECT t_id
                                                          FROM teacher AS te
                                                          WHERE te.t_name = '张三')));
-- 9、查询学过编号为"01"并且也学过编号为"02"的课程的同学的信息

SELECT a.*
FROM
  student a, score b, score c
WHERE a.s_id = b.s_id AND a.s_id = c.s_id AND b.c_id = '01' AND c.c_id = '02';

SELECT st.*
FROM score sc_1, score sc_2, student st
WHERE sc_1.s_id = sc_2.s_id AND sc_1.c_id = '01' AND sc_2.c_id = '02' AND sc_1.s_id = st.s_id;

-- 10、查询学过编号为"01"但是没有学过编号为"02"的课程的同学的信息

SELECT a.*
FROM
  student a
WHERE a.s_id IN (SELECT s_id
                 FROM score
                 WHERE c_id = '01') AND a.s_id NOT IN (SELECT s_id
                                                       FROM score);

SELECT st.*
FROM score sc_1, score sc_2, student st
WHERE sc_1.s_id = sc_2.s_id AND sc_1.c_id = '01' AND sc_2.c_id = '02' AND sc_1.s_id = st.s_id;

-- 11、查询没有学全所有课程的同学的信息
SELECT s.*
FROM
  student s
WHERE s.s_id IN (
  SELECT s_id
  FROM score
  WHERE s_id NOT IN (
    SELECT a.s_id
    FROM score a
      JOIN score b ON a.s_id = b.s_id AND b.c_id = '02'
      JOIN score c ON a.s_id = c.s_id AND c.c_id = '03'
    WHERE a.c_id = '01'));

SELECT st.*
FROM student st
WHERE st.s_id IN (
  SELECT sc.s_id
  FROM score sc
  WHERE sc.s_id NOT IN (
    SELECT sc.s_id
    FROM score sc
    GROUP BY sc.s_id
    HAVING count(*) >= (SELECT count(*)
                        FROM course)));

-- 12、查询至少有一门课与学号为"01"的同学所学相同的同学的信息
SELECT *
FROM student
WHERE s_id IN (
  SELECT DISTINCT a.s_id
  FROM score a
  WHERE a.c_id IN (SELECT a.c_id
                   FROM score a
                   WHERE a.s_id = '01')
);


SELECT st.*
FROM student st
WHERE st.s_id IN (SELECT DISTINCT s_id
                  FROM score
                  WHERE c_id IN (SELECT sc.c_id
                                 FROM score sc
                                 WHERE sc.s_id = '01'));

-- 13、查询和"01"号的同学学习的课程完全相同的其他同学的信息
SELECT a.*
FROM student a
WHERE a.s_id IN (
  SELECT DISTINCT s_id
  FROM score
  WHERE s_id != '01' AND c_id IN (SELECT c_id
                                  FROM score
                                  WHERE s_id = '01')
  GROUP BY s_id
  HAVING count(1) = (SELECT count(1)
                     FROM score
                     WHERE s_id = '01'));


SELECT st.*
FROM student st
WHERE st.s_id != '01' AND st.s_id IN (
  SELECT s_id
  FROM score
  WHERE c_id IN (SELECT c_id
                 FROM score
                 WHERE s_id = '01')
  GROUP BY s_id
  HAVING count(1) = (SELECT count(1)
                     FROM score
                     WHERE s_id = '01'));

-- 14、查询没学过"张三"老师讲授的任一门课程的学生姓名
SELECT a.s_name
FROM student a
WHERE a.s_id NOT IN (
  SELECT s_id
  FROM score
  WHERE c_id =
        (SELECT c_id
         FROM course
         WHERE t_id = (
           SELECT t_id
           FROM teacher
           WHERE t_name = '张三'))
  GROUP BY s_id);

SELECT s_name
FROM student
WHERE s_id NOT IN (
  SELECT s_id
  FROM score
  WHERE c_id = (SELECT c_id
                FROM teacher te, course co
                WHERE te.t_name = '张三' AND co.t_id = te.t_id));