浏览代码

first commit

wesley 6 年之前
当前提交
52baae4fef
共有 17 个文件被更改,包括 736 次插入0 次删除
  1. 11 0
      .idea/lixiao.iml
  2. 7 0
      .idea/misc.xml
  3. 8 0
      .idea/modules.xml
  4. 216 0
      .idea/workspace.xml
  5. 0 0
      items.json
  6. 0 0
      lixiao/__init__.py
  7. 二进制
      lixiao/__init__.pyc
  8. 14 0
      lixiao/items.py
  9. 103 0
      lixiao/middlewares.py
  10. 11 0
      lixiao/pipelines.py
  11. 90 0
      lixiao/settings.py
  12. 二进制
      lixiao/settings.pyc
  13. 4 0
      lixiao/spiders/__init__.py
  14. 二进制
      lixiao/spiders/__init__.pyc
  15. 261 0
      lixiao/spiders/lixiaosknb.py
  16. 二进制
      lixiao/spiders/lixiaosknb.pyc
  17. 11 0
      scrapy.cfg

+ 11 - 0
.idea/lixiao.iml

xqd
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>

+ 7 - 0
.idea/misc.xml

xqd
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

xqd
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/lixiao.iml" filepath="$PROJECT_DIR$/.idea/lixiao.iml" />
+    </modules>
+  </component>
+</project>

+ 216 - 0
.idea/workspace.xml

xqd
@@ -0,0 +1,216 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="41630725-fc3d-404c-bd38-79fa94d4509b" name="Default Changelist" comment="" />
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FUSProjectUsageTrigger">
+    <session id="74139605">
+      <usages-collector id="statistics.lifecycle.project">
+        <counts>
+          <entry key="project.closed" value="6" />
+          <entry key="project.open.time.3" value="1" />
+          <entry key="project.open.time.4" value="2" />
+          <entry key="project.open.time.5" value="1" />
+          <entry key="project.open.time.6" value="2" />
+          <entry key="project.opened" value="6" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.open">
+        <counts>
+          <entry key="py" value="3" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.open">
+        <counts>
+          <entry key="Python" value="3" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.edit">
+        <counts>
+          <entry key="py" value="4255" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.edit">
+        <counts>
+          <entry key="Python" value="4255" />
+        </counts>
+      </usages-collector>
+    </session>
+  </component>
+  <component name="FileEditorManager">
+    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/lixiao/spiders/__init__.py">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/lixiao/spiders/lixiaosknb.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="748">
+              <caret line="250" column="45" lean-forward="true" selection-start-line="250" selection-start-column="45" selection-end-line="250" selection-end-column="45" />
+              <folding>
+                <element signature="e#24#37#0" expanded="true" />
+                <marker date="1549942144736" expanded="true" signature="262:263" ph="..." />
+                <marker date="1549942144736" expanded="true" signature="6262:6266" ph="..." />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/lixiao/items.py">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+    </leaf>
+  </component>
+  <component name="FindInProjectRecents">
+    <findStrings>
+      <find>parse</find>
+      <find>&quot;</find>
+      <find>self.id</find>
+    </findStrings>
+    <replaceStrings>
+      <replace>'</replace>
+    </replaceStrings>
+  </component>
+  <component name="IdeDocumentHistory">
+    <option name="CHANGED_PATHS">
+      <list>
+        <option value="$PROJECT_DIR$/lixiao/spiders/lixiaosknb.py" />
+      </list>
+    </option>
+  </component>
+  <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsGulpfileManager">
+    <detection-done>true</detection-done>
+    <sorting>DEFINITION_ORDER</sorting>
+  </component>
+  <component name="ProjectFrameBounds" fullScreen="true">
+    <option name="x" value="1440" />
+    <option name="y" value="-220" />
+    <option name="width" value="2560" />
+    <option name="height" value="1440" />
+  </component>
+  <component name="ProjectView">
+    <navigator proportions="" version="1">
+      <foldersAlwaysOnTop value="true" />
+    </navigator>
+    <panes>
+      <pane id="Scope" />
+      <pane id="ProjectPane">
+        <subPane>
+          <expand>
+            <path>
+              <item name="lixiao" type="b2602c69:ProjectViewProjectNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="lixiao" type="b2602c69:ProjectViewProjectNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="lixiao" type="b2602c69:ProjectViewProjectNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+              <item name="spiders" type="462c0819:PsiDirectoryNode" />
+            </path>
+          </expand>
+          <select />
+        </subPane>
+      </pane>
+    </panes>
+  </component>
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
+    <property name="nodejs_npm_path_reset_for_default_project" value="true" />
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="41630725-fc3d-404c-bd38-79fa94d4509b" name="Default Changelist" comment="" />
+      <created>1542850742480</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1542850742480</updated>
+    </task>
+    <servers />
+  </component>
+  <component name="ToolWindowManager">
+    <frame x="1440" y="-220" width="2560" height="1440" extended-state="0" />
+    <editor active="true" />
+    <layout>
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25972995" />
+      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
+      <window_info id="Favorites" order="2" side_tool="true" />
+      <window_info anchor="bottom" id="Message" order="0" />
+      <window_info anchor="bottom" id="Find" order="1" />
+      <window_info anchor="bottom" id="Run" order="2" />
+      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
+      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
+      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
+      <window_info anchor="bottom" id="TODO" order="6" />
+      <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Version Control" order="8" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Database Changes" order="9" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
+      <window_info anchor="bottom" id="Terminal" order="11" />
+      <window_info anchor="bottom" id="Python Console" order="12" />
+      <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
+      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
+      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
+      <window_info anchor="right" id="SciView" order="3" />
+      <window_info anchor="right" id="Database" order="4" />
+    </layout>
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+  <component name="VcsContentAnnotationSettings">
+    <option name="myLimit" value="2678400000" />
+  </component>
+  <component name="editorHistoryManager">
+    <entry file="file://$PROJECT_DIR$/lixiao/spiders/__init__.py">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/lixiao/items.py">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/lixiao/spiders/lixiaosknb.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="748">
+          <caret line="250" column="45" lean-forward="true" selection-start-line="250" selection-start-column="45" selection-end-line="250" selection-end-column="45" />
+          <folding>
+            <element signature="e#24#37#0" expanded="true" />
+            <marker date="1549942144736" expanded="true" signature="262:263" ph="..." />
+            <marker date="1549942144736" expanded="true" signature="6262:6266" ph="..." />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+  </component>
+</project>

+ 0 - 0
items.json


+ 0 - 0
lixiao/__init__.py


二进制
lixiao/__init__.pyc


+ 14 - 0
lixiao/items.py

xqd
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
class LixiaoItem(scrapy.Item):
    """Item container for scraped company records.

    No fields are declared yet; add them as the spider grows, e.g.::

        name = scrapy.Field()

    See https://doc.scrapy.org/en/latest/topics/items.html
    """
    pass

+ 103 - 0
lixiao/middlewares.py

xqd
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
class LixiaoSpiderMiddleware(object):
    """Spider middleware for the lixiao project (Scrapy template defaults).

    Every hook below is a pure pass-through: requests, responses and
    results flow through unchanged. Scrapy treats an undefined hook the
    same way, so these methods exist only as extension points.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through to the spider untouched.

        Returning None continues processing; raising aborts the response.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield the spider's output (Requests, dicts or Items) as-is."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Defer to Scrapy's default exception handling (returns None)."""
        pass

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged.

        Unlike process_spider_output there is no response here, and only
        Requests (never items) may be yielded.
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log when the spider starts."""
        spider.logger.info('Spider opened: %s' % spider.name)
+
+
class LixiaoDownloaderMiddleware(object):
    """Downloader middleware for the lixiao project (Scrapy template defaults).

    All hooks are no-ops that let requests and responses pass through
    unchanged; Scrapy behaves identically when a hook is not defined.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Continue normal processing of every outgoing request.

        May instead return a Response, return a new Request, or raise
        IgnoreRequest to short-circuit the download.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged.

        Must return a Response or a Request, or raise IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Defer to the remaining exception-handling chain (returns None).

        Returning a Response or Request here would stop the chain instead.
        """
        pass

    def spider_opened(self, spider):
        """Log when the spider starts."""
        spider.logger.info('Spider opened: %s' % spider.name)

+ 11 - 0
lixiao/pipelines.py

xqd
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class LixiaoPipeline(object):
+    def process_item(self, item, spider):
+        return item

+ 90 - 0
lixiao/settings.py

xqd
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for lixiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lixiao'

SPIDER_MODULES = ['lixiao.spiders']
NEWSPIDER_MODULE = 'lixiao.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lixiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
# NOTE(review): the lixiaosknb spider sends its own authenticated API
# requests; if the target's robots.txt disallows them, this setting will
# silently filter those requests — confirm against the site's robots.txt.
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lixiao.middlewares.LixiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lixiao.middlewares.LixiaoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lixiao.pipelines.LixiaoPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

二进制
lixiao/settings.pyc


+ 4 - 0
lixiao/spiders/__init__.py

xqd
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

二进制
lixiao/spiders/__init__.pyc


+ 261 - 0
lixiao/spiders/lixiaosknb.py

xqd
@@ -0,0 +1,261 @@
# -*- coding: utf-8 -*-
import scrapy
import json
import pymysql.cursors
import time
import sys
import random

# Python 2 only: force the process-wide default encoding to UTF-8 so the
# Chinese company names survive implicit str/unicode conversions.
# Fix: guarded by version check — reload() is not a builtin and
# sys.setdefaultencoding() does not exist on Python 3 (str is already
# unicode there), so the unguarded calls crashed the module on Python 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 — builtin on Python 2 only
    sys.setdefaultencoding('utf8')
+
+
class LixiaosknbSpider(scrapy.Spider):
    """Crawl company profiles from biz.lixiaoskb.com into a MySQL database.

    Flow: ``parse`` logs in -> ``after_login`` pages through the company
    search API -> ``parse_item`` requests one market report per company ->
    ``get_baseinfo`` inserts the base record into ``company_info`` and fans
    out to the domain / shareholder / job / SEM endpoints, each of which
    inserts rows keyed by the freshly created ``company_info`` row id.

    Fixes vs. the original: Python-2-only ``dict.has_key`` replaced with
    ``dict.get``/``in``; ``format_time`` no longer truncates via integer
    division; the file handle and DB connection are released in ``closed``;
    the useless ``global company_id`` class-body statement was removed.
    """

    name = 'lixiaosknb'
    allowed_domains = ['biz.lixiaoskb.com']
    start_urls = ['https://biz.lixiaoskb.com/login']

    # NOTE(review): this dict is shared class state and is mutated per
    # request below (Authorization / Referer), so with concurrent requests
    # one request's Referer can leak into another. Kept as-is to preserve
    # behavior; the site only appears to need *a* plausible Referer.
    headers = {
        'Connection': 'keep-alive',  # keep the session alive between API calls
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Content-Type': 'application/json'
    }

    def __init__(self):
        # Fix: call the base initializer so Scrapy's Spider setup runs.
        super(LixiaosknbSpider, self).__init__()
        # NOTE(review): items.json is opened but never written to by this
        # spider; the handle is kept for compatibility and closed in closed().
        self.file = open('items.json', 'wb')
        # SECURITY: hard-coded database credentials — move them into Scrapy
        # settings or environment variables.
        self.connect = pymysql.connect(host='localhost',
                                       user='root',
                                       password='root',
                                       db='swdz_crm',
                                       charset='utf8mb4',
                                       cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connect.cursor()

    def closed(self, reason):
        """Scrapy shutdown hook: release the file handle, cursor and DB
        connection (the original version leaked all three)."""
        self.file.close()
        self.cursor.close()
        self.connect.close()

    def parse(self, response):
        """Log in to the site; the login response carries the API token.

        SECURITY: credentials are hard-coded — move to settings/env vars.
        """
        yield scrapy.FormRequest(
            url='https://biz.lixiaoskb.com/api/user/login',
            formdata={'username': '17381599246', 'password': 'yj870102722'},
            callback=self.after_login
        )

    def after_login(self, response):
        """Store the auth token, then page through the company search API."""
        res = json.loads(response.body)
        print(res['data'])
        token = res['data']['token']
        self.headers['Authorization'] = token.encode('utf-8')

        # range(1, 2) fetches a single page; widen the range to crawl more.
        for i in range(1, 2):
            myFormData = {
                "keyword": "",
                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
                "scope": "",
                "pagesize": "50",
                "page": i
            }
            print(i)
            # Crude rate limiting; NOTE(review): time.sleep blocks the whole
            # Twisted reactor — prefer DOWNLOAD_DELAY / AutoThrottle.
            if i % 10 == 0:
                time.sleep(random.randint(5, 30))
            time.sleep(0.5)

            yield scrapy.Request('https://biz.lixiaoskb.com/api/opensearch/search',
                                 method="POST",
                                 body=json.dumps(myFormData),
                                 headers=self.headers,
                                 callback=self.parse_item)

    def parse_item(self, response):
        """For each search hit, request the detailed market report."""
        body = json.loads(response.body)
        items = body['data']['items']
        for n, item in enumerate(items, start=1):
            company_key = item['id']
            market_company = item['value']

            # Jittered delay, with a longer pause every 10th company.
            time.sleep(random.random())
            if n % 10 == 0:
                time.sleep(random.randint(5, 10))

            # Fetch the base information; Referer mimics the report page.
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + company_key + '?company=' + market_company + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + company_key + '&market_company=' + market_company + '&market_source=search',
                headers=self.headers,
                meta={'id': company_key, 'name': market_company},
                callback=self.get_baseinfo
            )

    def get_baseinfo(self, response):
        """Insert the company's base record, then fan out to sub-resources.

        The auto-increment id of the new company_info row is threaded
        through meta as ``company_id`` for every follow-up insert.
        """
        body = json.loads(response.body)
        id = response.meta['id']
        name = response.meta['name']
        data = body['data']['baseinfo']

        annual_date = self.format_time(data['apprdate'])
        start_date = self.format_time(data['esdate'])
        # Fix: dict.get replaces the Python-2-only dict.has_key pattern.
        opfrom = data.get('opfrom', '')
        opto = data.get('opto', '')
        officialWebsite = data.get('officialWebsite', '')
        regno = data.get('regno', '')
        legalperson = data.get('legalperson', '')
        address = data.get('address', '')
        businessscope = data.get('businessscope', '')
        entstatus = data.get('entstatus', '')
        enttype = data.get('enttype', '')
        industry = data.get('industry', '')
        regcap = data.get('regcap', '')

        opentime = self.format_time(opfrom) + '-' + self.format_time(opto)
        # Parameterized insert (safe against SQL injection).
        self.cursor.execute(
            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date,start_date,open_time,reg_capital,website)
            value (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (data['entname'], regno, legalperson, address, businessscope,
             entstatus, enttype, industry, annual_date, start_date, opentime, regcap,
             officialWebsite))
        self.connect.commit()
        company_id = self.cursor.lastrowid

        time.sleep(0.5)

        self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
        # Company's web domains
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + id,
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_webs
        )

        time.sleep(0.5)
        # Shareholders
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=3',
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_shareholders
        )

        time.sleep(0.5)
        # Job postings (paginated via offset)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=4',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_jobs
        )

        time.sleep(0.5)
        # Online marketing / SEM records (paginated via offset)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=5',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_sem
        )

    def get_sem(self, response):
        """Store one page of SEM (search-engine marketing) records and
        recurse to the next offset until an empty page is returned."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                keywords = ','.join(item['semKeywords'])
                self.cursor.execute(
                    """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name)
                    value (%s, %s, %s, %s, %s, %s)""",
                    (company_id, item['semDate'], item['semTitle'], item['semUrl'], keywords, item['sourceName'],
                     ))
                self.connect.commit()

            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)

            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=5' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_sem
            )

    def get_webs(self, response):
        """Store the company's registered web domains (single page)."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        items = body['data']['items']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_domains(company_id, site_domain, site_home, site_name)
                    value (%s, %s, %s, %s)""",
                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
                self.connect.commit()

    def get_shareholders(self, response):
        """Store shareholder records when the report includes BE_INVEST."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        # Fix: 'in' replaces the Python-2-only has_key call.
        if 'BE_INVEST' in body['data']:
            items = body['data']['BE_INVEST']['data']
            if items:
                for item in items:
                    self.cursor.execute(
                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid)
                        value (%s, %s, %s, %s, %s)""",
                        (company_id, item['INV'], item['INSTO'], item['LIACCONAM'], item['LISUBCONAM']))
                    self.connect.commit()

    def get_jobs(self, response):
        """Store one page of job postings and recurse to the next offset
        until an empty page is returned."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name)
                    value (%s, %s, %s, %s, %s, %s, %s)""",
                    (company_id, item['jobName'], item['location'], item['salary'], item['jobUrl'], item['releaseDate'],
                     item['sourceName']))
                self.connect.commit()

            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)

            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=4' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_jobs
            )

    def format_time(self, mytime):
        """Convert a millisecond epoch timestamp to a 'YYYY-MM-DD' string.

        Returns '' for falsy input (None / 0 / '').
        Fix: divide by 1000.0 — the original ``float(mytime / 1000)``
        performed Python 2 integer division *before* the float conversion.
        """
        if mytime:
            timeStamp = mytime / 1000.0
            timeArray = time.localtime(timeStamp)
            return time.strftime("%Y-%m-%d", timeArray)
        return ''

二进制
lixiao/spiders/lixiaosknb.pyc


+ 11 - 0
scrapy.cfg

xqd
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = lixiao.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = lixiao