# import requests # from bs4 import BeautifulSoup # from selenium import webdriver # import time # import os # from datetime import datetime # import csv # from selenium.webdriver.common.keys import Keys # import random # from selenium.webdriver import ActionChains # # import json # import hashlib # import base64 # import hmac # from urllib.parse import quote_plus # import socket # import sys # from selenium.webdriver.chrome.options import Options # from selenium.webdriver.chrome.service import Service # from selenium.webdriver.common.desired_capabilities import DesiredCapabilities # from selenium.webdriver.common.proxy import Proxy, ProxyType # # 设置代理服务器 # # from selenium import webdriver # from selenium.webdriver.common.by import By # from selenium.webdriver.support.ui import WebDriverWait # from selenium.webdriver.support import expected_conditions as EC import time from varargs_kong import * sys.path.append(r"C:\多抓鱼\整合版本整合版本") ''' 更新记录:在1.1基础上新增利润和利润率过滤,符合要求的且没货的,且没加入书单的,点击加入书单 ''' #设置浏览器头 def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password, scheme='http', plugin_path=None): """Proxy Auth Extension args: proxy_host (str): domain or ip address, ie proxy.domain.com proxy_port (int): port proxy_username (str): auth username proxy_password (str): auth password kwargs: scheme (str): proxy scheme, default http plugin_path (str): absolute path of the extension return str -> plugin_path """ import string import zipfile if plugin_path is None: plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip' manifest_json = """ { "version": "1.0.0", "manifest_version": 2, "name": "Chrome Proxy", "permissions": [ "proxy", "tabs", "unlimitedStorage", "storage", "", "webRequest", "webRequestBlocking" ], "background": { "scripts": ["background.js"] }, "minimum_chrome_version":"22.0.0" } """ background_js = string.Template( """ var config = { mode: "fixed_servers", rules: { singleProxy: { scheme: "${scheme}", host: "${host}", port: parseInt(${port}) }, bypassList: ["foobar.com"] } }; chrome.proxy.settings.set({value: config, scope: "regular"}, function() {}); function callbackFn(details) { return { authCredentials: { username: "${username}", password: "${password}" } }; } chrome.webRequest.onAuthRequired.addListener( callbackFn, {urls: [""]}, ['blocking'] ); """ ).substitute( host=proxy_host, port=proxy_port, username=proxy_username, password=proxy_password, scheme=scheme, ) with zipfile.ZipFile(plugin_path, 'w') as zp: zp.writestr("manifest.json", manifest_json) zp.writestr("background.js", background_js) return plugin_path #设置浏览器头配置函数 def configure_headless_browser(proxy_config): chrome_options = Options() prefs = { 'profile.default_content_setting_values': { 'images': 2 } } chrome_options.add_experimental_option('prefs', prefs) chrome_options.add_argument('--disable-css-animation') chrome_options.add_argument('--disable-css-transitions') chrome_options.add_argument('--disable-javascript') chrome_options.add_argument("--start-maximized") proxyauth_plugin_path = create_proxyauth_extension( proxy_host=proxy_config[0], proxy_port=proxy_config[1], proxy_username=proxy_config[2], proxy_password=proxy_config[3] ) chrome_options.add_extension(proxyauth_plugin_path) # chrome_options.add_argument('--headless') # chrome_options.add_argument( # "user-agent='Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.69 Safari/537.36'") # chrome_service = Service("./chromedriver.exe") return webdriver.Chrome(options=chrome_options) #设置浏览器 def open_chrome(): # proxy = requests.get( # "http://19157751061.user.xiecaiyun.com/api/proxies?action=getJSON&key=NP14F9A7FC&count=&word=&rand=false&norepeat=false&detail=false<ime=&idshow=false") # j = json.loads(proxy.text) # p = j["result"][0] # proxyip = p["ip"] # proxyprot = str(p["port"]) # # 设置代理IP # proxy_config = [proxyip, proxyprot, proxyusernm, proxypasswd] # # 启动浏览器 # browser = configure_headless_browser(proxy_config) # return browser #user_data_dir = r'--user-data-dir=C:\Users\ZhuanZ\Local\Google\Chrome\User Data3' # # # 加载配置数据 user_data_dir = r'--user-data-dir=C:\Users\S2020\AppData\Local\Google\Chrome\User Data3' option = webdriver.ChromeOptions() option.add_argument(user_data_dir) browser = webdriver.Chrome(chrome_options=option) # => 注意这里的参数 # browser = webdriver.Chrome() # => 注意这里的参数 # browser.get('https://www.duozhuayu.com/search/book') # browser.maximize_window() #browser = webdriver.Chrome() # => 注意这里的参数 return browser # #获取浏览器cookie # user_data_dir = r'--user-data-dir=C:\Users\S2020\AppData\Local\Google\Chrome\User Data' # # # # 加载配置数据 # option = webdriver.ChromeOptions() # option.add_argument(user_data_dir) # browser = webdriver.Chrome(chrome_options=option) # => 注意这里的参数 # # # # #browser = webdriver.Chrome() # => 注意这里的参数 # # return browser def get_now_date(): # 获取当前日期和时间 now = datetime.now() # 格式化日期和时间 formatted_date = now.strftime("%Y%m%d") formatted_time = now.strftime("%H:%M:%S") # 输出格式化后的日期和时间 #print("格式化后的日期:", formatted_date) return formatted_date def get_bak_data(data,month_num): year = data[0:4] month = data[4:6] date = data[6:8] # print(year,month,date) if int(month) > month_num: bakmonth = int(month) - month_num if len(str(bakmonth)) == 1: bakmonth = '0' + str(bakmonth) else: bakmonth = str(bakmonth) new_date = year + bakmonth + date else: bakmonth = 12 - month_num + int(month) bakyear = int(year) - 1 if len(str(bakmonth)) == 1: bakmonth = '0' + str(bakmonth) else: bakmonth = str(bakmonth) new_date = str(bakyear) + str(bakmonth) + str(date) return new_date #日期 # date = int(get_bak_data(get_now_date(),month_num)) # def get_now_time(): # # 获取当前日期和时间 # now = datetime.now() # # # 格式化日期和时间 # formatted_date = now.strftime("%Y%m%d") # formatted_time = now.strftime("%H:%M") # # # 输出格式化后的日期和时间 # print("格式化后的日期:", formatted_time) # return formatted_time def shop_url_comb(url_shop_id,url_page): ''' 获取获取店铺数据链接 :param url_shop_id: :param url_page: :return: ''' url_website = "http://shop.kongfz.com/" url_shop_id = str(url_shop_id) url_term = "/all/0_50_0_0_" url_page = url_page "http://shop.kongfz.com/472611/all/0_50_0_0_1_sort_desc_10_0/" #下面的30-200是价格过滤 url_suffix = "_sort_desc_" + str(mix_price) + "_0/" url = url_website + url_shop_id + url_term + str(url_page) + url_suffix return url def get_book_name_isbn(url,url_shop_id): ''' 获取书籍信息和上书时间 :param url: :param url_shop_id: :return: ''' requests.adapters.DEFAULT_RETRIES = 5 s = requests.session() s.keep_alive = False ret = requests.get(url, headers=headers) ret.encoding = ret.apparent_encoding soup = BeautifulSoup(ret.text, 'html.parser') book_names = soup.findAll(name='a',attrs={"class":"row-name"}) #获取书名 book_content = [] book_url = 'https://book.kongfz.com/' onload_book_final = 0 for i in book_names: itemid = i['href'].replace(book_url+url_shop_id, '').replace('/', '') isbn = soup.find(name='div',attrs={"itemid":itemid})['isbn'] # price = soup.find(name='div',attrs={"itemid":itemid})['price'] # price = price[price.find("¥")+1:] # price = price[:price.find("\n")] book_content.append(isbn) onload_time = soup.findAll(name='div',attrs={"class":"add-time-box"}) for i in onload_time: onload_book_final = i.text onload_book_final = onload_book_final.replace("上书","") onload_book_final = onload_book_final.replace("-","") print("上书日期:"+onload_book_final) return book_content,onload_book_final def write_content_to_txt(contents,url): ''' 获取到的isbn写入表格 :param contents: :param url: :return: ''' url_csv = url + '.csv' with open(url_csv, 'a') as f: try: for i in contents[0]: print("contents:"+i) if len(i) == 13: # if len(i[0]) == 13 and i[1] > mix_number: f.write(i) # f.write(',') # f.write(str(i[1])) # f.write(',') # f.write(str(i[2])) # f.write(',') # f.write(str(i[3])) f.write('\n') except: print('continue') f.close() def read_scv(path): ''' 获取表格文件内容 :param path: :return: ''' con = [] with open(path, 'r') as f: content = csv.reader(f) for i in content: con.append(i) f.close() return con def deal_shop_id(shop_list1,shop_list2): ''' 返回新的表格数据,更新表格 :param shop_list1:已采集过的数据 :param shop_list2:老的数据 :return: ''' new_list = [] flag = 0 for i in shop_list2: for j in shop_list1: if i[0] == j[0]: flag = 1 if flag == 0: new_list.append(i) else: flag = 0 for j in shop_list1: new_list.append(j) return new_list def rm_shop_csv(path): ''' 获取删除文件 :param path: :return: ''' os.remove(path) def write_new_csv(path,data): ''' 将店铺数据写入表格 :param path: :param data: :return: ''' f = open(path, 'a') for i in data: f.write(i[0]) f.write(',') f.write(i[1]) f.write(',') f.write(i[2]) f.write('\n') # def get_book_msg(url,browser): # ''' # 获取书籍信息,名称,出版社等 # :param url: # :param browser: # :return: # ''' # try: # browser.switch_to.window(Handles2[0]) # # print("切换后的新窗口是:" + browser.title) # time.sleep(1) # browser.get(url) # browser.implicitly_wait(5) # browser.execute_script("window.stop()") # time.sleep(2) # except: # time.sleep(2) # return 0,'0',('0',"暂无出版社信息",'','0.00') # if browser.find_element_by_class_name("forbidden-con-box"): # print("出发告警") # return "forbidden" # # # global first_flag # if first_flag == 0: # first_flag = 1 # time.sleep(10) # # try: # result = browser.find_element_by_class_name("result-list") # except: # print("暂无出版社信息") # return 0,'0',('0',"暂无出版社信息",'','0.00') # #img = browser.find_element_by_id("mainInmg") # # book_name = result.find_element_by_class_name("title").text # book_url = result.find_element_by_class_name("title").find_element_by_class_name("link").get_attribute("href") # # for i in '\/:*?”<>|"': # book_name = book_name.replace(i,"") # book_name = book_name.replace(" ", "") # book_content = result.find_element_by_class_name( "zl-isbn-info").text # # book_content = book_content[book_content.find('/')+1:].replace(" ","") # # print(book_content) # book_press = book_content[:book_content.find('/')] # book_content = book_content[book_content.find('/') + 1:].replace(" ","") # book_press_time = book_content[:book_content.find('/')] # sale_price = book_content[book_content.rfind("¥")+1:].replace(" ","") # # print(book_press,book_press_time,sale_price) # # return book_name,book_press,book_press_time,sale_price,book_url def get_book_sale(browser,url): try: browser.get(url) element = WebDriverWait(browser, 10).until( EC.presence_of_element_located((By.CLASS_NAME, 'sold-time')) ) content = browser.find_elements_by_class_name("sold-time") num = 0 # price_total = [] for index, i in enumerate(content): # print(i.text) x = i.text.replace('-', '').replace(' 已售', '') if int(x) >= int(get_bak_data(get_now_date(),month_num)): num = num + 1 except: num = 0 print(num) return num def get_book_sale_price(url,browser): ''' 获取价格 :param url: :param browser: :param book_title: :return: ''' try: browser.get(url) # time.sleep(6) element = WebDriverWait(browser, 10).until( EC.presence_of_element_located((By.CLASS_NAME, 'product-item-box')) ) result_list = browser.find_element_by_class_name("product-item-box") price = float(result_list.find_element_by_class_name("price-info").text.replace("¥", "")) price2 = float(result_list.find_elements_by_class_name("price-info")[1].text.replace("¥", "")) price3 = float(result_list.find_elements_by_class_name("price-info")[2].text.replace("¥", "")) try: kuaidi = result_list.find_element_by_class_name("ship-fee-box").text if "包邮" not in kuaidi: kuaidi = float(kuaidi[kuaidi.index("¥") + 1:kuaidi.index(".") + 2]) else: kuaidi = 0 except: kuaidi = kuaidi_price try: kuaidi2 = result_list.find_elements_by_class_name("ship-fee-box")[1].text if "包邮" not in kuaidi2: kuaidi2 = float(kuaidi2[kuaidi2.index("¥") + 1:kuaidi2.index(".") + 2]) else: kuaidi2 = 0 except: kuaidi2 = kuaidi_price try: kuaidi3 = result_list.find_elements_by_class_name("ship-fee-box")[2].text if "包邮" not in kuaidi3: kuaidi3 = float(kuaidi3[kuaidi3.index("¥") + 1:kuaidi3.index(".") + 2]) else: kuaidi3 = 0 except: kuaidi3 = kuaidi_price price_final = 0 price_final1 = price + kuaidi price_final2 = price2 + kuaidi2 price_final3 = price3 + kuaidi3 if price_final2 - price_final1 > 5: price_final = price_final2 elif price_final3 - price_final2 > 5: price_final = price_final3 else: price_final = price_final1 print(price_final1, price_final2, price_final3) book_name = result_list.find_element_by_class_name("item-name").text book_press = result_list.find_elements_by_class_name("zl-info")[1].text book_press_time = result_list.find_elements_by_class_name("zl-info")[2].text return price_final except: return 0 index_404 = 0 sale_num_404 = 0 def get_book_content(book_con,browser): ''' 第二步,获取每个书籍的图片,作者,出版社,价格等信息 :param url: :return: ''' # url = "https://search.kongfz.com/item_result/?status=0&key="+book_con if len(book_con) == 13: # book_name = get_book_msg(url,browser) # if book_name == "forbidden": # return "forbidden" try: url1 = 'https://search.kongfz.com/product/?keyword=' url2 = '&quality=90~&quaSelect=2&page=1&actionPath=quality,sortType&sortType=7' url_t = url1 + str(book_con) + url2 book_sale_price = get_book_sale_price(url_t, browser) except: book_sale_price = 0 global index_404,sale_num_404 if book_sale_price == 0: index_404 = index_404 + 1 else: index_404 = 0 print("------------------------------") print("book_sale_price:"+str(book_sale_price)) # print(type(book_sale_price)) if book_sale_price < book_mix_price and book_sale_price >= 0: time.sleep(5) return False time.sleep(3) #获取已售数据 sale_url1 = 'https://search.kongfz.com/product/?dataType=1&keyword=' sale_url2 = '&sortType=10&page=1&actionPath=sortType' sale_url = sale_url1 + str(book_con) + sale_url2 num = get_book_sale(browser,sale_url) print("sale_num:"+str(num)) time.sleep(2) price_url1 = 'https://search.kongfz.com/product/?keyword=' price_url2 = '&quality=90~&quaSelect=2&page=1&actionPath=quality,sortType&sortType=7' price_url = price_url1 + str(book_con) + price_url2 if num == 0: sale_num_404 = sale_num_404 + 1 else: sale_num_404 = 0 #小谷吖数据 if num >= mix_number: f = open("销量达标.csv", 'a') f.write(price_url) f.write(',') f.write(book_con) f.write(',') f.write(str(book_sale_price)) f.write(',') f.write(str(num)) f.write('\n') else: return False else: return False def open_page(): ''' 启动浏览器的基础工作,登录 :return: ''' browser = open_chrome() #browser = webdriver.Chrome() try: browser.get("https://www.kongfz.com/") time.sleep(5) action = ActionChains(browser) logon = browser.find_elements_by_class_name("item-info")[0] action.move_to_element(logon).perform() time.sleep(2) browser.find_element_by_class_name("login-btn").click() time.sleep(3) browser.switch_to.frame(browser.find_element_by_id("iframe_login")) time.sleep(1) global name_index if name_index == len(name_msg)-1: name_index = 0 browser.find_element_by_id("username").send_keys(name_msg[0][0]) time.sleep(0.5) browser.find_element_by_id("password").send_keys(name_msg[0][1]) name_index = name_index + 1 time.sleep(1) browser.find_element_by_class_name("autoLogin").click() time.sleep(1) browser.find_element_by_class_name("login_submit").click() time.sleep(2) browser.switch_to.default_content() except: print("") time.sleep(10) return browser def get_shop_isbn(url,url_shop_id): ''' 获取店铺isbn :param url: :param url_shop_id: :return: ''' ret = requests.get(url, headers=headers) ret.encoding = ret.apparent_encoding soup = BeautifulSoup(ret.text, 'html.parser') book_names = soup.findAll(name='a',attrs={"class":"row-name"}) #获取书名 book_content = [] book_url = 'https://book.kongfz.com/' for i in book_names: itemid = i['href'].replace(book_url+url_shop_id, '').replace('/', '') isbn = soup.find(name='div',attrs={"itemid":itemid})['isbn'] # price = soup.find(name='div',attrs={"itemid":itemid})['price'] # price = price[price.find("¥")+1:] # price = price[:price.find("\n")] book_content.append(isbn) return book_content def get_isbn_url(browser,sale_url): ''' 获取指定isbn的已售链接 :return: ''' shop_list = [] browser.get(sale_url) element = WebDriverWait(browser, 10).until( EC.presence_of_element_located((By.CLASS_NAME, 'item-link')) ) a = browser.find_elements_by_class_name("item-link") s = 0 for i in a: if "https://book.kongfz.com/0/" in i.get_attribute("href"): print(i.get_attribute("href")) shop_list.append(i.get_attribute("href")) s = s + 1 if s == 50: break return shop_list def get_sale_shop_url(url): """ 根据已售数据获取店铺id :param url: :return: """ shop_id = "" ret = requests.get(url,headers = headers) ret.encoding = ret.apparent_encoding soup = BeautifulSoup(ret.text, 'html.parser') book_head = soup.find(name='head') # 获取书名 book_metas = book_head.findAll(name='meta') book_meta = book_metas[1].findAll(name='meta') for i in book_meta: i = str(i) if "url=https://shop.kongfz.com/" in i: i = i[i.index("url=https://shop.kongfz.com/")+28:] i = i[:i.index("/")] shop_id = i break return shop_id # print(i) def get_shop_msg(browser,url): ''' 获取店铺信息 :param browser: :param url: :return: ''' browser.get(url) element = WebDriverWait(browser, 10).until( EC.presence_of_element_located((By.CLASS_NAME, 'sale-count')) ) sale_num = browser.find_element_by_class_name("sale-count").text sale_num = sale_num.replace("(","") sale_num = sale_num.replace(")","") sale_num = sale_num.replace("笔","") print(sale_num) url1 = url + "/all/" browser.get(url1) element = WebDriverWait(browser, 10).until( EC.presence_of_element_located((By.CLASS_NAME, 'crumbs-nav-start')) ) tag = browser.find_elements_by_class_name("crumbs-nav-start")[-1].text shop_num = tag.replace("条结果","").replace(" ","") print(shop_num) print("\n") return sale_num,shop_num # date_6 = int(get_bak_data(get_now_date(),6)) if __name__ == "__main__": #更新数据 browser = open_page() shop_data = read_scv(shop_path) shop_new_data = [] isbn_index = 0 current_timestamp1 = int(time.time()) for shop in shop_data: for i in range(1,6): try: isbns = get_shop_isbn(shop_url_comb(shop[0],i),shop[0]) isbn_index = 0 except: isbn_index = isbn_index + 1 if isbn_index >2: isbn_index = 0 print("重新打开网页避免网页崩溃。") # browser.quit() time.sleep(5) # browser = open_page() print(isbns) for isbn in isbns: time.sleep(3) sale_lists = [] if isbn == "": continue sale_url = "https://search.kongfz.com/product/?dataType=1&keyword=" + str( isbn) + "&sortType=10&page=1&actionPath=sortType" try: sale_lists = get_isbn_url(browser, sale_url) isbn_index = 0 except: isbn_index = isbn_index + 1 if isbn_index > 2: isbn_index = 0 print("重新打开网页避免网页崩溃。") # browser.quit() time.sleep(5) # browser = open_page() # print(sale_lists) shop_ids = [] if sale_lists == []: continue for shop_list in sale_lists: try: shop_id = get_sale_shop_url(shop_list) except: continue print(shop_id) url = "http://shop.kongfz.com/" + str(shop_id) try: sale_num, shop_num = get_shop_msg(browser, url) except: continue if int(shop_num) >=100 and int(shop_num) <=shop_book_num: print(shop_new_data) if int(sale_num)/int(shop_num) >= shop_sale_num: f = open("test.txt","a") f.write(str(shop_id)) f.write('\n') f.close() shop_new_data= [[shop_id, str(int(get_bak_data(get_now_date(),6))), '8'],] shiyong_shop_data = [] for shop in shop_new_data: next = 0 s = 0 while 1: s = s + 1 if s > 20: break try: contents = get_book_name_isbn(shop_url_comb(str(shop[0]), s), str(shop[0])) write_content_to_txt(contents, str(shop[0])) print(shop[0], '第' + str(s) + '页数据已采集完成') except: print("next") next = next + 1 contents = [0, 0] # time.sleep(0.5) if next > 10: next = 0 break if int(contents[1]) <= int(shop[1]): break shiyong_shop_data.append([shop[0], get_now_date(), shop[2]]) try: book_isbns = read_scv(str(shop[0]) + '.csv') rm_shop_csv(shop[0] + '.csv') except: book_isbns = [] # 读完一条数据直接删除加写入 data = deal_shop_id(shiyong_shop_data, shop_data) print(data) f = open(shop_path, "w") f.close() write_new_csv(shop_path, data) shiyong_shop_data = [] sousuo_flag = 0 sleep_time = 0 index_sousuo = 0 for book_isbn in book_isbns: # current_timestamp2 = int(time.time()) # if current_timestamp2 - current_timestamp1 >= 3600: # current_timestamp1 = current_timestamp2 time.sleep(random.randint(4, 6)) if index_404 > 10: # time.sleep(60) index_404 = 0 sale_num_404 = 0 index_sousuo = 0 time.sleep(600) browser.quit() time.sleep(6) browser = open_page() if sale_num_404 > 10: # time.sleep(60) index_404 = 0 sale_num_404 = 0 index_sousuo = 0 time.sleep(600) browser.quit() time.sleep(6) browser = open_page() sousuo_flag = sousuo_flag + 1 print("isbn" + book_isbn[0]) if len(book_isbn[0]) != 13: continue try: result = get_book_content(book_isbn[0], browser) index_sousuo = index_sousuo + 1 if index_sousuo %100 == 0: time.sleep(random.randint(60, 120)) f = open("sousuo.txt","a",encoding="utf-8") f.write(str(datetime.now()) +": " + str(index_sousuo)) f.write("\n") f.close() print("采集数量:"+ str(index_sousuo)) if result == "forbidden": print("重新打开网页避免网页崩溃。") # browser.quit() time.sleep(30) # browser = open_page() # sleep_time = sleep_time + 1 except: print("重新打开网页避免网页崩溃。") # browser.quit() # time.sleep(30) # browser = open_page() continue # print ("多抓鱼对比通过数量:"+str(duozhuayu_flag),"通过isbn搜索数量:"+str(isbn_num)) # if sousuo_flag % 100 == 0: # print("重新打开网页避免网页崩溃。") # browser.quit() # time.sleep(120) # browser = open_page()