浏览代码

first commit

wesley 6 年之前
当前提交
52baae4fef
共有 17 个文件被更改,包括 736 次插入0 次删除
  1. 11 0
      .idea/lixiao.iml
  2. 7 0
      .idea/misc.xml
  3. 8 0
      .idea/modules.xml
  4. 216 0
      .idea/workspace.xml
  5. 0 0
      items.json
  6. 0 0
      lixiao/__init__.py
  7. 二进制
      lixiao/__init__.pyc
  8. 14 0
      lixiao/items.py
  9. 103 0
      lixiao/middlewares.py
  10. 11 0
      lixiao/pipelines.py
  11. 90 0
      lixiao/settings.py
  12. 二进制
      lixiao/settings.pyc
  13. 4 0
      lixiao/spiders/__init__.py
  14. 二进制
      lixiao/spiders/__init__.pyc
  15. 261 0
      lixiao/spiders/lixiaosknb.py
  16. 二进制
      lixiao/spiders/lixiaosknb.pyc
  17. 11 0
      scrapy.cfg

+ 11 - 0
.idea/lixiao.iml

xqd
@@ -0,0 +1,11 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<module type="PYTHON_MODULE" version="4">
+  <component name="NewModuleRootManager">
+    <content url="file://$MODULE_DIR$" />
+    <orderEntry type="inheritedJdk" />
+    <orderEntry type="sourceFolder" forTests="false" />
+  </component>
+  <component name="TestRunnerService">
+    <option name="PROJECT_TEST_RUNNER" value="Unittests" />
+  </component>
+</module>

+ 7 - 0
.idea/misc.xml

xqd
@@ -0,0 +1,7 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="JavaScriptSettings">
+    <option name="languageLevel" value="ES6" />
+  </component>
+  <component name="ProjectRootManager" version="2" project-jdk-name="Python 3.6" project-jdk-type="Python SDK" />
+</project>

+ 8 - 0
.idea/modules.xml

xqd
@@ -0,0 +1,8 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ProjectModuleManager">
+    <modules>
+      <module fileurl="file://$PROJECT_DIR$/.idea/lixiao.iml" filepath="$PROJECT_DIR$/.idea/lixiao.iml" />
+    </modules>
+  </component>
+</project>

+ 216 - 0
.idea/workspace.xml

xqd
@@ -0,0 +1,216 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<project version="4">
+  <component name="ChangeListManager">
+    <list default="true" id="41630725-fc3d-404c-bd38-79fa94d4509b" name="Default Changelist" comment="" />
+    <option name="EXCLUDED_CONVERTED_TO_IGNORED" value="true" />
+    <option name="SHOW_DIALOG" value="false" />
+    <option name="HIGHLIGHT_CONFLICTS" value="true" />
+    <option name="HIGHLIGHT_NON_ACTIVE_CHANGELIST" value="false" />
+    <option name="LAST_RESOLUTION" value="IGNORE" />
+  </component>
+  <component name="FUSProjectUsageTrigger">
+    <session id="74139605">
+      <usages-collector id="statistics.lifecycle.project">
+        <counts>
+          <entry key="project.closed" value="6" />
+          <entry key="project.open.time.3" value="1" />
+          <entry key="project.open.time.4" value="2" />
+          <entry key="project.open.time.5" value="1" />
+          <entry key="project.open.time.6" value="2" />
+          <entry key="project.opened" value="6" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.open">
+        <counts>
+          <entry key="py" value="3" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.open">
+        <counts>
+          <entry key="Python" value="3" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.extensions.edit">
+        <counts>
+          <entry key="py" value="4255" />
+        </counts>
+      </usages-collector>
+      <usages-collector id="statistics.file.types.edit">
+        <counts>
+          <entry key="Python" value="4255" />
+        </counts>
+      </usages-collector>
+    </session>
+  </component>
+  <component name="FileEditorManager">
+    <leaf SIDE_TABS_SIZE_LIMIT_KEY="300">
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/lixiao/spiders/__init__.py">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="true">
+        <entry file="file://$PROJECT_DIR$/lixiao/spiders/lixiaosknb.py">
+          <provider selected="true" editor-type-id="text-editor">
+            <state relative-caret-position="748">
+              <caret line="250" column="45" lean-forward="true" selection-start-line="250" selection-start-column="45" selection-end-line="250" selection-end-column="45" />
+              <folding>
+                <element signature="e#24#37#0" expanded="true" />
+                <marker date="1549942144736" expanded="true" signature="262:263" ph="..." />
+                <marker date="1549942144736" expanded="true" signature="6262:6266" ph="..." />
+              </folding>
+            </state>
+          </provider>
+        </entry>
+      </file>
+      <file pinned="false" current-in-tab="false">
+        <entry file="file://$PROJECT_DIR$/lixiao/items.py">
+          <provider selected="true" editor-type-id="text-editor" />
+        </entry>
+      </file>
+    </leaf>
+  </component>
+  <component name="FindInProjectRecents">
+    <findStrings>
+      <find>parse</find>
+      <find>&quot;</find>
+      <find>self.id</find>
+    </findStrings>
+    <replaceStrings>
+      <replace>'</replace>
+    </replaceStrings>
+  </component>
+  <component name="IdeDocumentHistory">
+    <option name="CHANGED_PATHS">
+      <list>
+        <option value="$PROJECT_DIR$/lixiao/spiders/lixiaosknb.py" />
+      </list>
+    </option>
+  </component>
+  <component name="JsBuildToolGruntFileManager" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsBuildToolPackageJson" detection-done="true" sorting="DEFINITION_ORDER" />
+  <component name="JsGulpfileManager">
+    <detection-done>true</detection-done>
+    <sorting>DEFINITION_ORDER</sorting>
+  </component>
+  <component name="ProjectFrameBounds" fullScreen="true">
+    <option name="x" value="1440" />
+    <option name="y" value="-220" />
+    <option name="width" value="2560" />
+    <option name="height" value="1440" />
+  </component>
+  <component name="ProjectView">
+    <navigator proportions="" version="1">
+      <foldersAlwaysOnTop value="true" />
+    </navigator>
+    <panes>
+      <pane id="Scope" />
+      <pane id="ProjectPane">
+        <subPane>
+          <expand>
+            <path>
+              <item name="lixiao" type="b2602c69:ProjectViewProjectNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="lixiao" type="b2602c69:ProjectViewProjectNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+            </path>
+            <path>
+              <item name="lixiao" type="b2602c69:ProjectViewProjectNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+              <item name="lixiao" type="462c0819:PsiDirectoryNode" />
+              <item name="spiders" type="462c0819:PsiDirectoryNode" />
+            </path>
+          </expand>
+          <select />
+        </subPane>
+      </pane>
+    </panes>
+  </component>
+  <component name="PropertiesComponent">
+    <property name="WebServerToolWindowFactoryState" value="false" />
+    <property name="last_opened_file_path" value="$PROJECT_DIR$" />
+    <property name="nodejs_interpreter_path.stuck_in_default_project" value="undefined stuck path" />
+    <property name="nodejs_npm_path_reset_for_default_project" value="true" />
+  </component>
+  <component name="RunDashboard">
+    <option name="ruleStates">
+      <list>
+        <RuleState>
+          <option name="name" value="ConfigurationTypeDashboardGroupingRule" />
+        </RuleState>
+        <RuleState>
+          <option name="name" value="StatusDashboardGroupingRule" />
+        </RuleState>
+      </list>
+    </option>
+  </component>
+  <component name="SvnConfiguration">
+    <configuration />
+  </component>
+  <component name="TaskManager">
+    <task active="true" id="Default" summary="Default task">
+      <changelist id="41630725-fc3d-404c-bd38-79fa94d4509b" name="Default Changelist" comment="" />
+      <created>1542850742480</created>
+      <option name="number" value="Default" />
+      <option name="presentableId" value="Default" />
+      <updated>1542850742480</updated>
+    </task>
+    <servers />
+  </component>
+  <component name="ToolWindowManager">
+    <frame x="1440" y="-220" width="2560" height="1440" extended-state="0" />
+    <editor active="true" />
+    <layout>
+      <window_info active="true" content_ui="combo" id="Project" order="0" visible="true" weight="0.25972995" />
+      <window_info id="Structure" order="1" side_tool="true" weight="0.25" />
+      <window_info id="Favorites" order="2" side_tool="true" />
+      <window_info anchor="bottom" id="Message" order="0" />
+      <window_info anchor="bottom" id="Find" order="1" />
+      <window_info anchor="bottom" id="Run" order="2" />
+      <window_info anchor="bottom" id="Debug" order="3" weight="0.4" />
+      <window_info anchor="bottom" id="Cvs" order="4" weight="0.25" />
+      <window_info anchor="bottom" id="Inspection" order="5" weight="0.4" />
+      <window_info anchor="bottom" id="TODO" order="6" />
+      <window_info anchor="bottom" id="Docker" order="7" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Version Control" order="8" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Database Changes" order="9" show_stripe_button="false" />
+      <window_info anchor="bottom" id="Event Log" order="10" side_tool="true" />
+      <window_info anchor="bottom" id="Terminal" order="11" />
+      <window_info anchor="bottom" id="Python Console" order="12" />
+      <window_info anchor="right" id="Commander" internal_type="SLIDING" order="0" type="SLIDING" weight="0.4" />
+      <window_info anchor="right" id="Ant Build" order="1" weight="0.25" />
+      <window_info anchor="right" content_ui="combo" id="Hierarchy" order="2" weight="0.25" />
+      <window_info anchor="right" id="SciView" order="3" />
+      <window_info anchor="right" id="Database" order="4" />
+    </layout>
+  </component>
+  <component name="TypeScriptGeneratedFilesManager">
+    <option name="version" value="1" />
+  </component>
+  <component name="VcsContentAnnotationSettings">
+    <option name="myLimit" value="2678400000" />
+  </component>
+  <component name="editorHistoryManager">
+    <entry file="file://$PROJECT_DIR$/lixiao/spiders/__init__.py">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/lixiao/items.py">
+      <provider selected="true" editor-type-id="text-editor" />
+    </entry>
+    <entry file="file://$PROJECT_DIR$/lixiao/spiders/lixiaosknb.py">
+      <provider selected="true" editor-type-id="text-editor">
+        <state relative-caret-position="748">
+          <caret line="250" column="45" lean-forward="true" selection-start-line="250" selection-start-column="45" selection-end-line="250" selection-end-column="45" />
+          <folding>
+            <element signature="e#24#37#0" expanded="true" />
+            <marker date="1549942144736" expanded="true" signature="262:263" ph="..." />
+            <marker date="1549942144736" expanded="true" signature="6262:6266" ph="..." />
+          </folding>
+        </state>
+      </provider>
+    </entry>
+  </component>
+</project>

+ 0 - 0
items.json


+ 0 - 0
lixiao/__init__.py


二进制
lixiao/__init__.pyc


+ 14 - 0
lixiao/items.py

xqd
@@ -0,0 +1,14 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your scraped items
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/items.html
+
+import scrapy
+
+
class LixiaoItem(scrapy.Item):
    """Item container for scraped company records.

    No fields are declared yet; add them as the spider grows, e.g.::

        name = scrapy.Field()

    See https://doc.scrapy.org/en/latest/topics/items.html
    """
    pass

+ 103 - 0
lixiao/middlewares.py

xqd
@@ -0,0 +1,103 @@
+# -*- coding: utf-8 -*-
+
+# Define here the models for your spider middleware
+#
+# See documentation in:
+# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
+
+from scrapy import signals
+
+
class LixiaoSpiderMiddleware(object):
    """Spider middleware for the lixiao project (Scrapy template defaults).

    Every hook below is a pure pass-through: requests, responses and
    results flow through unchanged. Scrapy treats an undefined hook the
    same way, so these methods exist only as extension points.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_spider_input(self, response, spider):
        """Let every response through to the spider untouched.

        Returning None continues processing; raising aborts the response.
        """
        return None

    def process_spider_output(self, response, result, spider):
        """Re-yield the spider's output (Requests, dicts or Items) as-is."""
        for produced in result:
            yield produced

    def process_spider_exception(self, response, exception, spider):
        """Defer to Scrapy's default exception handling (returns None)."""
        pass

    def process_start_requests(self, start_requests, spider):
        """Forward the spider's start requests unchanged.

        Unlike process_spider_output there is no response here, and only
        Requests (never items) may be yielded.
        """
        for request in start_requests:
            yield request

    def spider_opened(self, spider):
        """Log when the spider starts."""
        spider.logger.info('Spider opened: %s' % spider.name)
+
+
class LixiaoDownloaderMiddleware(object):
    """Downloader middleware for the lixiao project (Scrapy template defaults).

    All hooks are no-ops that let requests and responses pass through
    unchanged; Scrapy behaves identically when a hook is not defined.
    """

    @classmethod
    def from_crawler(cls, crawler):
        """Build the middleware and subscribe to the spider_opened signal."""
        middleware = cls()
        crawler.signals.connect(middleware.spider_opened,
                                signal=signals.spider_opened)
        return middleware

    def process_request(self, request, spider):
        """Continue normal processing of every outgoing request.

        May instead return a Response, return a new Request, or raise
        IgnoreRequest to short-circuit the download.
        """
        return None

    def process_response(self, request, response, spider):
        """Hand the downloaded response back unchanged.

        Must return a Response or a Request, or raise IgnoreRequest.
        """
        return response

    def process_exception(self, request, exception, spider):
        """Defer to the remaining exception-handling chain (returns None).

        Returning a Response or Request here would stop the chain instead.
        """
        pass

    def spider_opened(self, spider):
        """Log when the spider starts."""
        spider.logger.info('Spider opened: %s' % spider.name)

+ 11 - 0
lixiao/pipelines.py

xqd
@@ -0,0 +1,11 @@
+# -*- coding: utf-8 -*-
+
+# Define your item pipelines here
+#
+# Don't forget to add your pipeline to the ITEM_PIPELINES setting
+# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
+
+
+class LixiaoPipeline(object):
+    def process_item(self, item, spider):
+        return item

+ 90 - 0
lixiao/settings.py

xqd
@@ -0,0 +1,90 @@
# -*- coding: utf-8 -*-

# Scrapy settings for lixiao project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lixiao'

SPIDER_MODULES = ['lixiao.spiders']
NEWSPIDER_MODULE = 'lixiao.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lixiao (+http://www.yourdomain.com)'

# Obey robots.txt rules
# NOTE(review): the lixiaosknb spider sends its own authenticated API
# requests; if the target's robots.txt disallows them, this setting will
# silently filter those requests — confirm against the site's robots.txt.
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'lixiao.middlewares.LixiaoSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'lixiao.middlewares.LixiaoDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lixiao.pipelines.LixiaoPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

二进制
lixiao/settings.pyc


+ 4 - 0
lixiao/spiders/__init__.py

xqd
@@ -0,0 +1,4 @@
+# This package will contain the spiders of your Scrapy project
+#
+# Please refer to the documentation for information on how to create and manage
+# your spiders.

二进制
lixiao/spiders/__init__.pyc


+ 261 - 0
lixiao/spiders/lixiaosknb.py

xqd
@@ -0,0 +1,261 @@
# -*- coding: utf-8 -*-
import scrapy
import json
import pymysql.cursors
import time
import sys
import random

# Python 2 only: force the process-wide default encoding to UTF-8 so the
# Chinese company names survive implicit str/unicode conversions.
# Fix: guarded by version check — reload() is not a builtin and
# sys.setdefaultencoding() does not exist on Python 3 (str is already
# unicode there), so the unguarded calls crashed the module on Python 3.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 — builtin on Python 2 only
    sys.setdefaultencoding('utf8')
+
+
class LixiaosknbSpider(scrapy.Spider):
    """Crawl company profiles from biz.lixiaoskb.com into a MySQL database.

    Flow: ``parse`` logs in -> ``after_login`` pages through the company
    search API -> ``parse_item`` requests one market report per company ->
    ``get_baseinfo`` inserts the base record into ``company_info`` and fans
    out to the domain / shareholder / job / SEM endpoints, each of which
    inserts rows keyed by the freshly created ``company_info`` row id.

    Fixes vs. the original: Python-2-only ``dict.has_key`` replaced with
    ``dict.get``/``in``; ``format_time`` no longer truncates via integer
    division; the file handle and DB connection are released in ``closed``;
    the useless ``global company_id`` class-body statement was removed.
    """

    name = 'lixiaosknb'
    allowed_domains = ['biz.lixiaoskb.com']
    start_urls = ['https://biz.lixiaoskb.com/login']

    # NOTE(review): this dict is shared class state and is mutated per
    # request below (Authorization / Referer), so with concurrent requests
    # one request's Referer can leak into another. Kept as-is to preserve
    # behavior; the site only appears to need *a* plausible Referer.
    headers = {
        'Connection': 'keep-alive',  # keep the session alive between API calls
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.82 Safari/537.36',
        'Content-Type': 'application/json'
    }

    def __init__(self):
        # Fix: call the base initializer so Scrapy's Spider setup runs.
        super(LixiaosknbSpider, self).__init__()
        # NOTE(review): items.json is opened but never written to by this
        # spider; the handle is kept for compatibility and closed in closed().
        self.file = open('items.json', 'wb')
        # SECURITY: hard-coded database credentials — move them into Scrapy
        # settings or environment variables.
        self.connect = pymysql.connect(host='localhost',
                                       user='root',
                                       password='root',
                                       db='swdz_crm',
                                       charset='utf8mb4',
                                       cursorclass=pymysql.cursors.DictCursor)
        self.cursor = self.connect.cursor()

    def closed(self, reason):
        """Scrapy shutdown hook: release the file handle, cursor and DB
        connection (the original version leaked all three)."""
        self.file.close()
        self.cursor.close()
        self.connect.close()

    def parse(self, response):
        """Log in to the site; the login response carries the API token.

        SECURITY: credentials are hard-coded — move to settings/env vars.
        """
        yield scrapy.FormRequest(
            url='https://biz.lixiaoskb.com/api/user/login',
            formdata={'username': '17381599246', 'password': 'yj870102722'},
            callback=self.after_login
        )

    def after_login(self, response):
        """Store the auth token, then page through the company search API."""
        res = json.loads(response.body)
        print(res['data'])
        token = res['data']['token']
        self.headers['Authorization'] = token.encode('utf-8')

        # range(1, 2) fetches a single page; widen the range to crawl more.
        for i in range(1, 2):
            myFormData = {
                "keyword": "",
                "filter": "{\"location\":[\"5101\"],\"industryshort\":[],\"registercapital\":\"0\",\"establishment\":\"0\",\"entstatus\":\"0\",\"enttype\":\"0\"}",
                "scope": "",
                "pagesize": "50",
                "page": i
            }
            print(i)
            # Crude rate limiting; NOTE(review): time.sleep blocks the whole
            # Twisted reactor — prefer DOWNLOAD_DELAY / AutoThrottle.
            if i % 10 == 0:
                time.sleep(random.randint(5, 30))
            time.sleep(0.5)

            yield scrapy.Request('https://biz.lixiaoskb.com/api/opensearch/search',
                                 method="POST",
                                 body=json.dumps(myFormData),
                                 headers=self.headers,
                                 callback=self.parse_item)

    def parse_item(self, response):
        """For each search hit, request the detailed market report."""
        body = json.loads(response.body)
        items = body['data']['items']
        for n, item in enumerate(items, start=1):
            company_key = item['id']
            market_company = item['value']

            # Jittered delay, with a longer pause every 10th company.
            time.sleep(random.random())
            if n % 10 == 0:
                time.sleep(random.randint(5, 10))

            # Fetch the base information; Referer mimics the report page.
            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + company_key + '?company=' + market_company + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/marketReport?id=' + company_key + '&market_company=' + market_company + '&market_source=search',
                headers=self.headers,
                meta={'id': company_key, 'name': market_company},
                callback=self.get_baseinfo
            )

    def get_baseinfo(self, response):
        """Insert the company's base record, then fan out to sub-resources.

        The auto-increment id of the new company_info row is threaded
        through meta as ``company_id`` for every follow-up insert.
        """
        body = json.loads(response.body)
        id = response.meta['id']
        name = response.meta['name']
        data = body['data']['baseinfo']

        annual_date = self.format_time(data['apprdate'])
        start_date = self.format_time(data['esdate'])
        # Fix: dict.get replaces the Python-2-only dict.has_key pattern.
        opfrom = data.get('opfrom', '')
        opto = data.get('opto', '')
        officialWebsite = data.get('officialWebsite', '')
        regno = data.get('regno', '')
        legalperson = data.get('legalperson', '')
        address = data.get('address', '')
        businessscope = data.get('businessscope', '')
        entstatus = data.get('entstatus', '')
        enttype = data.get('enttype', '')
        industry = data.get('industry', '')
        regcap = data.get('regcap', '')

        opentime = self.format_time(opfrom) + '-' + self.format_time(opto)
        # Parameterized insert (safe against SQL injection).
        self.cursor.execute(
            """insert into company_info(company_name, reg_no, legal_person, reg_addr, scope, open_status, ent_type, industry, annual_date,start_date,open_time,reg_capital,website)
            value (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)""",
            (data['entname'], regno, legalperson, address, businessscope,
             entstatus, enttype, industry, annual_date, start_date, opentime, regcap,
             officialWebsite))
        self.connect.commit()
        company_id = self.cursor.lastrowid

        time.sleep(0.5)

        self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
        # Company's web domains
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/getDomains?id=' + id,
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_webs
        )

        time.sleep(0.5)
        # Shareholders
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=3',
            headers=self.headers,
            meta={'company_id': company_id},
            callback=self.get_shareholders
        )

        time.sleep(0.5)
        # Job postings (paginated via offset)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=4',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_jobs
        )

        time.sleep(0.5)
        # Online marketing / SEM records (paginated via offset)
        yield scrapy.Request(
            'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + id + '&tag=5',
            headers=self.headers,
            meta={'company_id': company_id, 'offset': 1, 'id': id, 'name': name},
            callback=self.get_sem
        )

    def get_sem(self, response):
        """Store one page of SEM (search-engine marketing) records and
        recurse to the next offset until an empty page is returned."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                keywords = ','.join(item['semKeywords'])
                self.cursor.execute(
                    """insert into company_sems(company_id, sem_date, sem_title, sem_url, keywords, source_name)
                    value (%s, %s, %s, %s, %s, %s)""",
                    (company_id, item['semDate'], item['semTitle'], item['semUrl'], keywords, item['sourceName'],
                     ))
                self.connect.commit()

            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)

            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=5' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_sem
            )

    def get_webs(self, response):
        """Store the company's registered web domains (single page)."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        items = body['data']['items']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_domains(company_id, site_domain, site_home, site_name)
                    value (%s, %s, %s, %s)""",
                    (company_id, item['SITEDOMAIN'], item['SITEHOME'], item['SITENAME']))
                self.connect.commit()

    def get_shareholders(self, response):
        """Store shareholder records when the report includes BE_INVEST."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        # Fix: 'in' replaces the Python-2-only has_key call.
        if 'BE_INVEST' in body['data']:
            items = body['data']['BE_INVEST']['data']
            if items:
                for item in items:
                    self.cursor.execute(
                        """insert into company_shareholders(company_id, username, insto, amount, amount_paid)
                        value (%s, %s, %s, %s, %s)""",
                        (company_id, item['INV'], item['INSTO'], item['LIACCONAM'], item['LISUBCONAM']))
                    self.connect.commit()

    def get_jobs(self, response):
        """Store one page of job postings and recurse to the next offset
        until an empty page is returned."""
        body = json.loads(response.body)
        company_id = response.meta['company_id']
        offset = response.meta['offset']
        id = response.meta['id']
        name = response.meta['name']
        items = body['data']['item']
        if items:
            for item in items:
                self.cursor.execute(
                    """insert into company_jobs(company_id, job_name, location, salary, job_url, release_date, source_name)
                    value (%s, %s, %s, %s, %s, %s, %s)""",
                    (company_id, item['jobName'], item['location'], item['salary'], item['jobUrl'], item['releaseDate'],
                     item['sourceName']))
                self.connect.commit()

            time.sleep(1)
            offset = offset + 1
            if offset % 5 == 0:
                time.sleep(5)

            self.headers['Referer'] = 'https://biz.lixiaoskb.com/report/' + id + '?company=' + name + '&source=search'
            yield scrapy.Request(
                'https://biz.lixiaoskb.com/api/opensearch/querySpecifiedSiteInfo?id=' + str(
                    id) + '&tag=4' + '&offset=' + str(offset),
                headers=self.headers,
                meta={'company_id': company_id, 'offset': offset, 'id': id, 'name': name},
                callback=self.get_jobs
            )

    def format_time(self, mytime):
        """Convert a millisecond epoch timestamp to a 'YYYY-MM-DD' string.

        Returns '' for falsy input (None / 0 / '').
        Fix: divide by 1000.0 — the original ``float(mytime / 1000)``
        performed Python 2 integer division *before* the float conversion.
        """
        if mytime:
            timeStamp = mytime / 1000.0
            timeArray = time.localtime(timeStamp)
            return time.strftime("%Y-%m-%d", timeArray)
        return ''

二进制
lixiao/spiders/lixiaosknb.pyc


+ 11 - 0
scrapy.cfg

xqd
@@ -0,0 +1,11 @@
+# Automatically created by: scrapy startproject
+#
+# For more information about the [deploy] section see:
+# https://scrapyd.readthedocs.io/en/latest/deploy.html
+
+[settings]
+default = lixiao.settings
+
+[deploy]
+#url = http://localhost:6800/
+project = lixiao