123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261 |
# -*- coding: utf-8 -*-
import scrapy
import json
import pymysql.cursors
import time
import sys
import random
# Python 2-only hack: force the process-wide default string encoding to UTF-8
# so implicit str<->unicode coercion of Chinese company names does not raise
# UnicodeDecodeError. NOTE(review): reload(sys)/sys.setdefaultencoding do not
# exist in Python 3 — this whole file is Python 2 and needs porting.
reload(sys)
sys.setdefaultencoding('utf8')
class LixiaosknbSpider(scrapy.Spider):
    """Crawl biz.lixiaoskb.com: log in, page through the company search API,
    then fetch and persist each company's registration info, owned web
    domains, shareholders, job postings and SEM (search-engine marketing)
    records into the `swdz_crm` MySQL database.

    NOTE(review): the time.sleep() calls in callbacks block Scrapy's reactor;
    the throttling only "works" because this crawl is effectively serial.
    Prefer DOWNLOAD_DELAY / AUTOTHROTTLE in settings instead.
    """
    name = 'lixiaosknb'
    allowed_domains = ['biz.lixiaoskb.com']
    start_urls = ['https://biz.lixiaoskb.com/login']
    # Shared header dict; Authorization and Referer are mutated per request.
    # NOTE(review): with concurrent requests the Referer value can race —
    # confirm the crawl really is serial before raising CONCURRENT_REQUESTS.
    headers = {
        'Connection': 'keep-alive',  # keep the TCP connection alive between requests
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Content-Type': 'application/json'
    }
    # (bug fix) removed the class-level `global company_id` statement: it was
    # a no-op at class scope; the id travels through request meta instead.

    def __init__(self):
        # Resources opened here are released in closed() when the spider stops.
        self.file = open('items.json', 'wb')
        # NOTE(review): hard-coded DB credentials; move to settings/env vars.
        self.connect = pymysql.connect(host='localhost',
                                       user='root',
                                       password='root',
                                       db='swdz_crm',
                                       charset='utf8mb4',
                                       cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connect.cursor()

    def closed(self, reason):
        """Scrapy shutdown hook: close the DB cursor/connection and the output
        file (bug fix: these were previously leaked)."""
        try:
            self.cursor.close()
            self.connect.close()
        finally:
            self.file.close()

    def parse(self, response):
        """Log in to the site. NOTE(review): hard-coded account credentials."""
        yield scrapy.FormRequest(
            url='https://biz.lixiaoskb.com/api/user/login',
            formdata={'username': '17381599246', 'password': 'yj870102722'},
            callback=self.after_login
        )

    def after_login(self, response):
        """Store the auth token, then page through the company search API."""
        res = json.loads(response.body)
        print(res['data'])
        token = res['data']['token']
        self.headers['Authorization'] = token.encode('utf-8')
        for i in range(1, 2):  # currently only page 1; widen the range to crawl more
            myFormData = {
                "keyword": "",
                # server-side filter: location code 5101, no other constraints
                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
                "scope": "",
                "pagesize": "50",
                "page": i
            }
            print(i)
            if i % 10 == 0:
                time.sleep(random.randint(5, 30))  # longer pause every 10 pages
            time.sleep(0.5)
            yield scrapy.Request('https://biz.lixiaoskb.com/api/opensearch/search',
                                 method="POST",
                                 body=json.dumps(myFormData),
                                 headers=self.headers,
                                 callback=self.parse_item)

    def parse_item(self, response):
        """For each company in a search-result page, request its base report."""
        body = json.loads(response.body)
        items = body['data']['items']
        count = 0
        for item in items:
            count = count + 1
            company_key = item['id']        # renamed from `id` (shadowed builtin)
            market_company = item['value']  # company display name
            time.sleep(random.random())
            if count % 10 == 0:
                time.sleep(random.randint(5, 10))  # longer pause every 10 companies
            # fetch the company's base information
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + company_key + '?company=' + market_company + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + company_key + '&market_company=' + market_company + '&market_source=search',
                headers=self.headers,
                meta={'id': company_key, 'name': market_company},
                callback=self.get_baseinfo
            )

    # fetch base information
    def get_baseinfo(self, response):
        """Persist the company's registration data, then fan out follow-up
        requests for domains, shareholders, jobs and SEM records, passing the
        freshly inserted company_id through request meta."""
        body = json.loads(response.body)
        id = response.meta['id']
        name = response.meta['name']
        data = body['data']['baseinfo']
        annual_date = self.format_time(data['apprdate'])
        start_date = self.format_time(data['esdate'])
        # dict.get replaces dict.has_key (bug fix: has_key was removed in
        # Python 3; .get(k, '') is the drop-in equivalent in both versions)
        opfrom = data.get('opfrom', '')
        opto = data.get('opto', '')
        officialWebsite = data.get('officialWebsite', '')
        regno = data.get('regno', '')
        legalperson = data.get('legalperson', '')
        address = data.get('address', '')
        businessscope = data.get('businessscope', '')
        entstatus = data.get('entstatus', '')
        enttype = data.get('enttype', '')
        industry = data.get('industry', '')
        regcap = data.get('regcap', '')
        opentime = self.format_time(opfrom) + '-' + self.format_time(opto)
        self.cursor.execute(
            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date,start_date,open_time,reg_capital,website)
            value (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (data['entname'], regno, legalperson, address, businessscope,
             entstatus, enttype, industry, annual_date, start_date, opentime, regcap,
             officialWebsite))
        # commit the SQL statement
        self.connect.commit()
        company_id = self.cursor.lastrowid
        time.sleep(0.5)
        self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
        # fetch the company's websites
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + id,
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_webs
        )
        time.sleep(0.5)
        # fetch shareholder information (tag=3)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=3',
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_shareholders
        )
        time.sleep(0.5)
        # fetch job postings (tag=4)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=4',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_jobs
        )
        time.sleep(0.5)
        # fetch online marketing / SEM records (tag=5)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=5',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_sem
        )

    # fetch online marketing / SEM records
    def get_sem(self, response):
        """Store one page of SEM records, then follow the next offset page
        until the API returns an empty item list."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                keywords = ','.join(item['semKeywords'])
                self.cursor.execute(
                    """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name)
                    value (%s, %s, %s, %s, %s, %s)""",
                    (company_id, item['semDate'], item['semTitle'], item['semUrl'], keywords, item['sourceName'],
                     ))
            # commit the SQL statement
            self.connect.commit()
            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)  # longer pause every 5 pages
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=5' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_sem
            )

    # fetch the company's websites
    def get_webs(self, response):
        """Store the company's registered web domains (single page)."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        items = body['data']['items']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_domains(company_id, site_domain, site_home, site_name)
                    value (%s, %s, %s, %s)""",
                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
            # commit the SQL statement
            self.connect.commit()

    # fetch shareholder information
    def get_shareholders(self, response):
        """Store the company's shareholders, if the report includes any."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        # `in` replaces dict.has_key (bug fix: removed in Python 3)
        if 'BE_INVEST' in body['data']:
            items = body['data']['BE_INVEST']['data']
            if items:
                for item in items:
                    self.cursor.execute(
                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid)
                        value (%s, %s, %s, %s, %s)""",
                        (company_id, item['INV'], item['INSTO'], item['LIACCONAM'], item['LISUBCONAM']))
                # commit the SQL statement
                self.connect.commit()

    # fetch job postings
    def get_jobs(self, response):
        """Store one page of job postings, then follow the next offset page
        until the API returns an empty item list."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name)
                    value (%s, %s, %s, %s, %s, %s, %s)""",
                    (company_id, item['jobName'], item['location'], item['salary'], item['jobUrl'], item['releaseDate'],
                     item['sourceName']))
            # commit the SQL statement
            self.connect.commit()
            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)  # longer pause every 5 pages
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=4' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_jobs
            )

    # convert a millisecond epoch timestamp into a date string
    def format_time(self, mytime):
        """Return `mytime` (epoch milliseconds) as 'YYYY-MM-DD' in local time,
        or '' for a falsy input (None / 0 / '')."""
        if mytime:
            # Divide by 1000.0 (bug fix: `float(mytime / 1000)` truncated to
            # whole seconds under Python 2 integer division before converting).
            timeStamp = mytime / 1000.0
            timeArray = time.localtime(timeStamp)
            return time.strftime("%Y-%m-%d", timeArray)
        return ''
|