from varargs_duozhuayu import *
sys.path.append(r"C:\多抓鱼\整合版本整合版本")

# Changelog: on top of v1.1, added profit and profit-margin filtering. Books that
# meet the thresholds, are out of stock and are not yet on the book list get the
# "add to book list" button clicked.


# ---------------------------------------------------------------------------
# Browser setup
# ---------------------------------------------------------------------------

def create_proxyauth_extension(proxy_host, proxy_port, proxy_username, proxy_password,
                               scheme='http', plugin_path=None):
    """Build a Chrome extension zip that configures an authenticated proxy.

    Args:
        proxy_host (str): domain or ip address, e.g. proxy.domain.com
        proxy_port (int): port
        proxy_username (str): auth username
        proxy_password (str): auth password
        scheme (str): proxy scheme, default http
        plugin_path (str): path of the zip to write; defaults to
            'Selenium-Chrome-HTTP-Private-Proxy.zip' in the working directory

    Returns:
        str: plugin_path
    """
    import string
    import zipfile

    if plugin_path is None:
        plugin_path = 'Selenium-Chrome-HTTP-Private-Proxy.zip'

    # NOTE(review): the source had an empty string where "<all_urls>" is restored
    # below (here and in the onAuthRequired filter); it looks like the angle-bracket
    # token was stripped when the file was extracted - confirm.
    manifest_json = """
    {
        "version": "1.0.0",
        "manifest_version": 2,
        "name": "Chrome Proxy",
        "permissions": [
            "proxy",
            "tabs",
            "unlimitedStorage",
            "storage",
            "<all_urls>",
            "webRequest",
            "webRequestBlocking"
        ],
        "background": {
            "scripts": ["background.js"]
        },
        "minimum_chrome_version":"22.0.0"
    }
    """

    background_js = string.Template(
        """
        var config = {
                mode: "fixed_servers",
                rules: {
                  singleProxy: {
                    scheme: "${scheme}",
                    host: "${host}",
                    port: parseInt(${port})
                  },
                  bypassList: ["foobar.com"]
                }
              };

        chrome.proxy.settings.set({value: config, scope: "regular"}, function() {});

        function callbackFn(details) {
            return {
                authCredentials: {
                    username: "${username}",
                    password: "${password}"
                }
            };
        }

        chrome.webRequest.onAuthRequired.addListener(
                    callbackFn,
                    {urls: ["<all_urls>"]},
                    ['blocking']
        );
        """
    ).substitute(
        host=proxy_host,
        port=proxy_port,
        username=proxy_username,
        password=proxy_password,
        scheme=scheme,
    )

    with zipfile.ZipFile(plugin_path, 'w') as zp:
        zp.writestr("manifest.json", manifest_json)
        zp.writestr("background.js", background_js)

    return plugin_path


def configure_headless_browser(proxy_config):
    """Start Chrome with images disabled and the proxy-auth extension loaded.

    :param proxy_config: indexable; [0] host, [1] port, [2] username, [3] password
    :return: the new ``webdriver.Chrome`` instance
    """
    chrome_options = Options()
    prefs = {
        'profile.default_content_setting_values': {
            'images': 2
        }
    }
    chrome_options.add_experimental_option('prefs', prefs)
    chrome_options.add_argument('--disable-css-animation')
    chrome_options.add_argument('--disable-css-transitions')
    chrome_options.add_argument('--disable-javascript')
    chrome_options.add_argument("--start-maximized")
    proxyauth_plugin_path = create_proxyauth_extension(
        proxy_host=proxy_config[0],
        proxy_port=proxy_config[1],
        proxy_username=proxy_config[2],
        proxy_password=proxy_config[3]
    )
    chrome_options.add_extension(proxyauth_plugin_path)
    return webdriver.Chrome(options=chrome_options)


def open_chrome():
    """Start a plain Chrome session with default options.

    A proxy-backed start through configure_headless_browser() existed here as
    commented-out code and is currently not used.
    """
    browser = webdriver.Chrome()
    return browser


def open_chrome2():
    """Start Chrome on the local user's profile directory (reuses its cookies)."""
    user_data_dir = r'--user-data-dir=C:\Users\S2020\AppData\Local\Google\Chrome\User Data'
    option = webdriver.ChromeOptions()
    option.add_argument(user_data_dir)
    # ``options=`` matches configure_headless_browser(); the older
    # ``chrome_options=`` keyword is deprecated.
    browser = webdriver.Chrome(options=option)
    return browser


# ---------------------------------------------------------------------------
# Dates
# ---------------------------------------------------------------------------

def get_now_date():
    """Print and return today's date as a ``YYYYMMDD`` string."""
    formatted_date = datetime.datetime.now().strftime("%Y%m%d")
    print("格式化后的日期:", formatted_date)
    return formatted_date


def get_bak_data(data, month_num):
    """Return the ``YYYYMMDD`` string ``data`` moved back by ``month_num`` months.

    Only year and month are adjusted; the day digits are copied unchanged, so
    the result is not guaranteed to be a real calendar date (e.g. "20230231").

    :param data: date string in ``YYYYMMDD`` form
    :param month_num: number of months to go back (any non-negative int)
    :return: the shifted ``YYYYMMDD`` string
    """
    year = int(data[0:4])
    month = int(data[4:6])
    day = data[6:8]
    # Count months from year 0 so that any month_num (also > 12) rolls over correctly.
    bak_year, bak_month_index = divmod(year * 12 + (month - 1) - month_num, 12)
    return '{:04d}{:02d}{}'.format(bak_year, bak_month_index + 1, day)


# Cut-off date (as an int like 20230115): sales on or after it are counted
# by get_book_sale().
date = int(get_bak_data(get_now_date(), month_num))


def get_now_time():
    """Print and return the current time as ``HH:MM``."""
    formatted_time = datetime.datetime.now().strftime("%H:%M")
    print("格式化后的日期:", formatted_time)
    return formatted_time


# ---------------------------------------------------------------------------
# Kongfz shop listing pages
# ---------------------------------------------------------------------------

def shop_url_comb(url_shop_id, url_page):
    """Build the listing URL of one page of a kongfz shop.

    Example result: http://shop.kongfz.com/472611/all/0_50_0_0_1_sort_desc_10_0/
    The number after ``sort_desc_`` is the price filter taken from ``mix_price``.
    """
    url_website = "http://shop.kongfz.com/"
    url_term = "/all/0_50_0_0_"
    url_suffix = "_sort_desc_" + str(mix_price) + "_0/"
    return url_website + str(url_shop_id) + url_term + str(url_page) + url_suffix


def get_book_name_isbn(url, url_shop_id):
    """Fetch a shop listing page and collect the ISBN of every listed book.

    :param url: listing page URL
    :param url_shop_id: shop id, stripped from each item link to get the item id
    :return: ``(isbns, onload_book_final)`` where ``onload_book_final`` is the
        text of the last "add-time-box" on the page with "上书" and "-" removed
        (``0`` if the page has none)
    """
    ret = requests.get(url, headers=headers)
    ret.encoding = ret.apparent_encoding
    soup = BeautifulSoup(ret.text, 'html.parser')

    item_prefix = 'https://book.kongfz.com/' + str(url_shop_id)
    book_content = []
    for link in soup.findAll(name='a', attrs={"class": "row-name"}):
        itemid = link['href'].replace(item_prefix, '').replace('/', '')
        isbn = soup.find(name='div', attrs={"itemid": itemid})['isbn']
        book_content.append(isbn)

    onload_book_final = 0
    for box in soup.findAll(name='div', attrs={"class": "add-time-box"}):
        onload_book_final = box.text.replace("上书", "").replace("-", "")
    print(onload_book_final)
    return book_content, onload_book_final


# ---------------------------------------------------------------------------
# Small file helpers
# ---------------------------------------------------------------------------

def _append_row(path, fields):
    """Append ``fields`` to ``path`` as one comma-joined line.

    Values are written with ``str()`` and are not quoted or escaped. Pass a
    trailing ``''`` to get the trailing comma some of the output files use.
    """
    with open(path, 'a') as f:
        f.write(','.join(str(field) for field in fields))
        f.write('\n')


def write_content_to_txt(contents, url):
    """Append every 13-character entry of ``contents[0]`` to ``<url>.csv``.

    Best effort: the first error stops the writing and prints 'continue'.
    """
    url_csv = url + '.csv'
    with open(url_csv, 'a') as f:
        try:
            for isbn in contents[0]:
                print(isbn)
                if len(isbn) == 13:
                    f.write(isbn)
                    f.write('\n')
        except Exception:
            print('continue')


def read_scv(path):
    """Read the CSV file at ``path`` and return its rows as a list of lists."""
    with open(path, 'r') as f:
        return list(csv.reader(f))


def deal_shop_id(shop_list1, shop_list2):
    '''
    Merge two shop tables and return the new table data.

    Rows of ``shop_list2`` whose first column does not occur in ``shop_list1``
    come first, followed by all rows of ``shop_list1``; at most the last 15
    rows of that merged list are returned.

    :param shop_list1: data that has already been collected
    :param shop_list2: the old data
    :return: list of rows
    '''
    collected_ids = [row[0] for row in shop_list1]
    new_list = [row for row in shop_list2 if row[0] not in collected_ids]
    new_list.extend(shop_list1)
    return new_list[-15:]


def rm_shop_csv(path):
    """Delete the file at ``path``."""
    os.remove(path)


def write_new_csv(path, data):
    '''
    Append shop data to the table file, one ``col0,col1,col2`` line per row.

    :param path: file to append to
    :param data: iterable of rows with at least three string columns
    :return: None
    '''
    with open(path, 'a') as f:
        for row in data:
            f.write(row[0])
            f.write(',')
            f.write(row[1])
            f.write(',')
            f.write(row[2])
            f.write('\n')


# ---------------------------------------------------------------------------
# Kongfz book data
# ---------------------------------------------------------------------------

def get_book_msg(url, browser):
    '''
    Get the book information shown on a kongfz search page.

    :param url: search page URL
    :param browser: webdriver used to load the page
    :return: ``(book_name, book_press, book_press_time, sale_price, book_url)``.
        NOTE(review): when the page or its 'result-list' cannot be loaded the
        function returns a differently shaped 3-tuple
        ``(0, '0', ('0', "暂无出版社信息", '', '0.00'))``; kept as is because
        callers are not visible here.
    '''
    try:
        browser.get(url)
        result = WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'result-list'))
        )
    except Exception:
        return 0, '0', ('0', "暂无出版社信息", '', '0.00')

    title = result.find_element(By.CLASS_NAME, "title")
    book_name = title.text
    book_url = title.find_element(By.CLASS_NAME, "link").get_attribute("href")
    # Strip characters that are not allowed in Windows file names, then spaces.
    for char in '\\/:*?”<>|"':
        book_name = book_name.replace(char, "")
    book_name = book_name.replace(" ", "")

    # The info line is '/'-separated: the second field is read as the publisher
    # and the third as the publication time.
    book_content = result.find_element(By.CLASS_NAME, "zl-isbn-info").text
    book_content = book_content[book_content.find('/') + 1:].replace(" ", "")
    book_press = book_content[:book_content.find('/')]
    book_content = book_content[book_content.find('/') + 1:].replace(" ", "")
    book_press_time = book_content[:book_content.find('/')]
    sale_price = book_content[book_content.rfind("¥") + 1:].replace(" ", "")
    return book_name, book_press, book_press_time, sale_price, book_url


def get_book_sale(browser, url):
    """Count the sold listings on ``url`` whose sold date is on/after ``date``.

    :param browser: webdriver used to load the page
    :param url: kongfz "sold items" search URL
    :return: number of matching listings; 0 if the page could not be loaded
        (after a 30 s pause) or no listing could be read
    """
    try:
        # Set the timeout before loading so it also applies to this request.
        browser.set_page_load_timeout(5)
        browser.get(url)
        time.sleep(2)
    except Exception:
        time.sleep(30)
        return 0

    try:
        browser.implicitly_wait(5)  # wait up to 5 s for elements to appear
        sold_times = browser.find_elements(By.CLASS_NAME, "sold-time")
    except Exception:
        return 0

    num = 0
    for sold_time in sold_times:
        try:
            sold_on = int(sold_time.text.replace('-', '').replace(' 已售', ''))
        except Exception:
            # Skip a row that cannot be read instead of discarding the whole count.
            continue
        if sold_on >= date:
            num = num + 1
    return num


# ---------------------------------------------------------------------------
# Duozhuayu detail page evaluation
# ---------------------------------------------------------------------------

# (minimum recent sales, minimum profit margin) pairs, highest sales first. The
# restock-reminder button is clicked when the margin reaches the value of the
# first tier whose sales threshold is met.
_MSG_CLICK_TIERS = ((20, 0.0), (15, 0.5), (10, 0.8), (6, 1), (3, 2.5))
_ISBN_CLICK_TIERS = ((20, 0.0), (15, 0.5), (10, 0.8), (6, 1))


def _leave_detail_page(browser):
    """Go back to the previous page, pausing 2 s before and 1 s after."""
    time.sleep(2)
    browser.back()
    time.sleep(1)


def _parse_price(text):
    """Return the digits between '¥' and the first decimal place of ``text``."""
    return text[text.index("¥") + 1:text.index(".") + 2]


def _page_has_text(browser, text):
    """Return True if some element's text on the page contains ``text``."""
    try:
        browser.find_element(By.XPATH, "//*[contains(text(),'" + text + "')]")
        return True
    except Exception:
        return False


def _check_never_restocked(browser):
    """Read the page title and detect a book that was never in stock.

    :return: ``(name_book, never_restocked)``. ``name_book`` is the <h1> text
        when the page shows '暂时无货', otherwise ''. When the page shows
        neither '上次到货' nor '预计最近会到货', the title is appended to
        从未到货.txt, the browser navigates back and ``never_restocked`` is True.
    """
    try:
        name_book = browser.find_element(By.TAG_NAME, "h1").text
        browser.find_element(By.XPATH, "//*[contains(text(),'暂时无货')]")
        if not (_page_has_text(browser, '上次到货')
                or _page_has_text(browser, '预计最近会到货')):
            with open("从未到货.txt", "a") as f:
                f.write(name_book)
                f.write("\n")
            _leave_detail_page(browser)
            return name_book, True
    except Exception:
        print(1)
        return '', False
    return name_book, False


def _read_info_rows(browser):
    """Read list price, publisher, publication date and author from the page.

    :return: ``(sale_price, chubanshe, pub_date, zuozhe)``; a field that is not
        present on the page is '0'
    """
    sale_price, chubanshe, pub_date, zuozhe = '0', '0', '0', '0'
    for row in browser.find_elements(By.CLASS_NAME, "info-row"):
        text = row.text
        if "原价" in text:
            sale_price = text.replace("原价\n¥", "")
            print(sale_price)
        if "出版社" in text:
            chubanshe = text.replace("出版社\n", "")
            print(chubanshe)
        if "出版\n" in text:
            pub_date = text.replace("出版\n", "")
            print(pub_date)
        if "作者\n" in text:
            zuozhe = text.replace("作者\n", "")
            print(zuozhe)
    return sale_price, chubanshe, pub_date, zuozhe


def _assess_listing(browser, num, book_sale_price, sale_num, book_isbn,
                    price, details, name_book, min_margin, click_tiers):
    """Record the open detail page and decide whether to click its button.

    Profit is ``book_sale_price * 0.95 - price - price_kuaidi`` and the margin
    is profit divided by price. Books that can be bought are appended to
    多抓鱼有货.csv; out-of-stock books with profit >= 5 and margin >= 0.25 are
    appended to 多抓鱼数据存档.csv. The '到货提醒' button is clicked when profit
    is at least 10, the margin is at least ``min_margin`` and the margin meets
    the tier that matches ``sale_num``. The browser always navigates back.

    :return: ``(price, num, lirun, lirunlv, 1)`` when the profit checks pass,
        ``(price, num, 0, 0, 0)`` otherwise
    """
    sale_price, chubanshe, pub_date, zuozhe = details
    book_flag = browser.find_element(By.CLASS_NAME, "Button-inner")
    button_text = book_flag.text

    if button_text == '多个品相可选':
        price = _parse_price(browser.find_element(By.CLASS_NAME, "Price--padStart").text)
    pinxiang = ''
    if button_text == '加入购物车':
        pinxiang = browser.find_element(By.CLASS_NAME, "book-quality").text

    lirun = float(float(book_sale_price) * 0.95 - float(price) - price_kuaidi)
    lirunlv = float(lirun) / float(price)
    url = ('https://search.kongfz.com/product_result/?key=' + book_isbn +
           '&itemfilter=0&status=0&hasStock=1&order=100&ajaxdata=1&quality=90h&quaselect=2&contentname=content')

    if button_text == '多个品相可选' or button_text == '加入购物车':
        _append_row('多抓鱼有货.csv',
                    [url, book_isbn, book_sale_price, price, sale_num, pinxiang, ''])
    if lirun >= 5 and lirunlv >= 0.25 and button_text == '到货提醒':
        _append_row('多抓鱼数据存档.csv',
                    [url, book_isbn, book_sale_price, price, sale_num, sale_price,
                     chubanshe, pub_date.replace("-", ""), zuozhe, name_book, ''])

    if lirun < 10 or lirunlv < min_margin:
        _leave_detail_page(browser)
        return (price, num, 0, 0, 0)

    # Decide whether the book is worth buying.
    if button_text == '到货提醒':
        for min_sales, min_ratio in click_tiers:
            if int(sale_num) >= min_sales:
                if lirunlv >= min_ratio:
                    book_flag.click()
                break

    _leave_detail_page(browser)
    return (price, num, lirun, lirunlv, 1)


def check_msg(browser, book_name, num, book_sale_price, sale_num, book_isbn=''):
    """Evaluate the open duozhuayu detail page after a search by title.

    The page is only evaluated when its publisher and publication date equal
    ``book_name[1]`` and ``book_name[2]``.

    :param browser: webdriver currently showing the detail page
    :param book_name: kongfz book data; [1] publisher, [2] publication date
    :param num: status label passed through as the second result field
    :param book_sale_price: kongfz selling price
    :param sale_num: recent kongfz sales count
    :param book_isbn: ISBN written to the output files
    :return: ``(price, num, lirun, lirunlv, flag)``; ``('999', num, 0, 0, 0)``
        when the page cannot be read, does not match, or was never in stock
    """
    name_book, never_restocked = _check_never_restocked(browser)
    if never_restocked:
        return ('999', num, 0, 0, 0)

    try:
        price = _parse_price(browser.find_element(By.CLASS_NAME, "Price").text)
        details = _read_info_rows(browser)
    except Exception:
        _leave_detail_page(browser)
        return ('999', num, 0, 0, 0)

    sale_price, chubanshe, pub_date, _ = details
    if not (chubanshe == book_name[1] and pub_date == book_name[2]):
        print(book_name)
        print(chubanshe, pub_date, sale_price)
        _leave_detail_page(browser)
        print("failed")
        return ('999', num, 0, 0, 0)

    print(11111111111111111111)
    try:
        return _assess_listing(browser, num, book_sale_price, sale_num, book_isbn,
                               price, details, name_book, 0.5, _MSG_CLICK_TIERS)
    except Exception:
        # Same recovery as check_isbn(): go back so the caller can carry on.
        _leave_detail_page(browser)
        return ('999', num, 0, 0, 0)


def check_isbn(browser, num, book_sale_price, sale_num, book_isbn=''):
    """Evaluate the open duozhuayu detail page after a search that gave one hit.

    Same as check_msg() but without comparing publisher and date, with a
    minimum margin of 0.35 and without the 3-5 sales click tier.

    :return: ``(price, num, lirun, lirunlv, flag)``; ``('999', num, 0, 0, 0)``
        when the page cannot be read or the book was never in stock
    """
    name_book, never_restocked = _check_never_restocked(browser)
    if never_restocked:
        return ('999', num, 0, 0, 0)

    try:
        price = _parse_price(browser.find_element(By.CLASS_NAME, "Price").text)
        details = _read_info_rows(browser)
        return _assess_listing(browser, num, book_sale_price, sale_num, book_isbn,
                               price, details, name_book, 0.35, _ISBN_CLICK_TIERS)
    except Exception:
        _leave_detail_page(browser)
        return ('999', num, 0, 0, 0)


# ---------------------------------------------------------------------------
# Duozhuayu search
# ---------------------------------------------------------------------------

# Pauses after click / clear / typing / ENTER in the search box. A number is a
# fixed delay in seconds, a (low, high) tuple a random whole number of seconds.
_NAME_SEARCH_PAUSES = ((1, 3), 0.5, (1, 3), (2, 3))
_ISBN_SEARCH_PAUSES_RANDOM = ((1, 3), (1, 2), (1, 2), (2, 3))
_ISBN_SEARCH_PAUSES_FIXED = (0.5, 0.5, 0.5, 2)


def _pause(spec):
    """Sleep ``spec`` seconds, or ``random.randint(*spec)`` seconds for a tuple."""
    if isinstance(spec, tuple):
        time.sleep(random.randint(*spec))
    else:
        time.sleep(spec)


def _submit_search(driver, text, pauses):
    """Type ``text`` into the search box and press ENTER, pausing between steps."""
    after_click, after_clear, after_typing, after_enter = pauses
    # The input is looked up again for every step instead of being reused.
    driver.find_element(By.CLASS_NAME, "search-input").click()
    _pause(after_click)
    driver.find_element(By.CLASS_NAME, "search-input").clear()
    _pause(after_clear)
    driver.find_element(By.CLASS_NAME, "search-input").send_keys(text)
    _pause(after_typing)
    driver.find_element(By.CLASS_NAME, "search-input").send_keys(Keys.ENTER)
    _pause(after_enter)


def _result_items(search_result, list_class):
    """Return the result items inside the ``list_class`` list of the results."""
    result_list = search_result.find_element(By.CLASS_NAME, list_class)
    return result_list.find_elements(By.CLASS_NAME, "search_result_item")


def _throttle_isbn_search():
    """Wait until ``time_flag`` seconds have passed since the last ISBN search.

    Updates the module globals ``time1`` and ``time2``.
    NOTE(review): ``time2`` records the time before the forced wait, not after
    it - kept as in the original.
    """
    global time1, time2
    time1 = int(time.time())
    if time2 + time_flag >= time1:
        print("间隔时间不够,强制等待")
        print(time1, time2)
        time.sleep(time_flag + time2 - time1)
        print("此次采集isbn间隔时间:" + str(time_flag) + "秒")
    else:
        print("间隔时间足够,开始采集")
        print(time1, time2)
        print("此次采集isbn间隔时间:" + str(time1 - time2) + "秒")
    time2 = time1


def _search_by_isbn(driver, book_isbn, book_sale_price, sale_num, pauses):
    """Fallback: search duozhuayu by ISBN and evaluate an unambiguous hit.

    Only a single result (in stock or out of stock) is opened and passed to
    check_isbn(); every other outcome is reported with '999' as price. Counts
    the search in the global ``isbn_num`` and an empty result in the global
    ``xianzhi_flag``.

    :return: a 5-tuple as described in open_url()
    """
    global isbn_num, xianzhi_flag
    _throttle_isbn_search()
    _submit_search(driver, book_isbn, pauses)
    isbn_num = isbn_num + 1

    try:
        search_result = driver.find_element(By.CLASS_NAME, "search-results")
    except Exception:
        return ('999', '4', book_sale_price, sale_num, 0)
    in_stock = _result_items(search_result, "instock-list")
    sold_out = _result_items(search_result, "result-list")

    if len(in_stock) == 1 and len(sold_out) == 0:
        in_stock[0].find_element(By.CLASS_NAME, "SearchBookItem-title").click()
        time.sleep(1)
        return check_isbn(driver, '0有货', book_sale_price, sale_num, book_isbn)
    if len(in_stock) > 1 and len(sold_out) == 0:
        time.sleep(1)
        return ('999', '1有货大于1', book_sale_price, sale_num, 0)
    if len(in_stock) > 0 and len(sold_out) > 0:
        return ('999', '1有货没货都有', book_sale_price, sale_num, 0)
    if len(in_stock) == 0 and len(sold_out) == 1:
        sold_out[0].find_element(By.CLASS_NAME, "SearchBookItem-title").click()
        time.sleep(1)
        return check_isbn(driver, '2没货', book_sale_price, sale_num, book_isbn)
    if len(in_stock) == 0 and len(sold_out) > 1:
        return ('999', '2没货', book_sale_price, sale_num, 0)

    # Neither list has an item.
    print("此书无数据")
    xianzhi_flag = xianzhi_flag + 1
    return ('999', '3无数据', book_sale_price, sale_num, 0)


def _try_matching_items(driver, list_class, count, skip_last, strict, click_pause,
                        label, book_name, book_sale_price, sale_num, book_isbn):
    """Open result items that look like ``book_name`` until one is accepted.

    The results are located again on every round because check_msg() navigates
    away from and back to the result page.

    :param list_class: class of the result list to walk through
    :param count: number of items the list had right after the search
    :param skip_last: do not look at the last item
    :param strict: True - the title must match AND the publisher must be in the
        description; False - a title match OR the publisher is enough
    :param click_pause: pause spec (see _pause) after opening an item
    :param label: status label handed to check_msg()
    :return: the accepted check_msg() result, the error tuple with label '4'
        when the results disappeared, or None when nothing was accepted
    """
    for index in range(count):
        if skip_last and index + 1 == count:
            break
        try:
            try:
                search_result = driver.find_element(By.CLASS_NAME, "search-results")
            except Exception:
                return ('999', '4', book_sale_price, sale_num, 0)
            item = _result_items(search_result, list_class)[index]
            title = item.find_element(By.CLASS_NAME, "SearchBookItem-title").text
            desc = item.find_element(By.CLASS_NAME, "SearchBookItem-description").text
            print(book_name)
            print(title, desc)

            title_matches = book_name[0] in title or title in book_name[0]
            publisher_matches = book_name[1] in desc
            if strict:
                is_candidate = title_matches and publisher_matches
            else:
                is_candidate = title_matches or publisher_matches
            if not is_candidate:
                continue

            item.find_element(By.CLASS_NAME, "SearchBookItem-title").click()
            _pause(click_pause)
            result = check_msg(driver, book_name, label, book_sale_price, sale_num, book_isbn)
            print(result[0])
            if result[0] != '999':
                return result
        except Exception:
            continue
    return None


def open_url(driver, book_isbn, book_name, book_sale_price, sale_num):
    '''
    Compare a kongfz book against duozhuayu: search by title, open the matching
    result and evaluate it; fall back to a search by ISBN when the title search
    is ambiguous.

    Status labels (second field of the result) start with:
    0 = in stock, 1 = out of stock, 4 = error,
    2 = out of stock and no matching book, 3 = nothing found by the search

    :param driver: webdriver showing a duozhuayu page with the search box
    :param book_isbn: ISBN of the book
    :param book_name: kongfz book data; [0] title, [1] publisher, [2] publication date
    :param book_sale_price: kongfz selling price
    :param sale_num: recent kongfz sales count
    :return: 5-tuple. From an evaluated page: ``(price, label, lirun, lirunlv, flag)``;
        otherwise ``('999', label, book_sale_price, sale_num, 0)``
    '''
    try:
        _submit_search(driver, book_name[0], _NAME_SEARCH_PAUSES)
    except Exception:
        time.sleep(120)
        return ('999', '4', book_sale_price, sale_num, 0)

    try:
        search_result = driver.find_element(By.CLASS_NAME, "search-results")
    except Exception:
        return ('999', '4', book_sale_price, sale_num, 0)

    in_stock = _result_items(search_result, "instock-list")   # purchasable results
    sold_out = _result_items(search_result, "result-list")    # out-of-stock results
    in_stock_count = len(in_stock)
    sold_out_count = len(sold_out)

    # No result at all.
    if in_stock_count == 0 and sold_out_count == 0:
        print("此书无数据")
        return ('999', '3无数据', book_sale_price, sale_num, 0)

    # Exactly one in-stock result.
    if in_stock_count == 1 and sold_out_count == 0:
        in_stock[0].find_element(By.CLASS_NAME, "SearchBookItem-title").click()
        time.sleep(random.randint(1, 2))
        return check_isbn(driver, '0有货', book_sale_price, sale_num, book_isbn)

    # Exactly one out-of-stock result.
    if in_stock_count == 0 and sold_out_count == 1:
        sold_out[0].find_element(By.CLASS_NAME, "SearchBookItem-title").click()
        time.sleep(1)
        return check_isbn(driver, '2没货', book_sale_price, sale_num, book_isbn)

    # Several results: walk through the lists, then fall back to the ISBN.
    # Each pass is (list class, item count, skip_last, strict, click pause, label).
    # NOTE(review): the passes with skip_last=True never look at the last item of
    # their list, exactly like the original loops - confirm whether that is intended.
    if sold_out_count == 0:
        passes = [("instock-list", in_stock_count, True, False, (1, 2), '0有货')]
        isbn_pauses = _ISBN_SEARCH_PAUSES_RANDOM
    elif in_stock_count > 0:
        passes = [("instock-list", in_stock_count, True, True, 1, '0有货'),
                  ("result-list", sold_out_count, True, True, 1, '1没货')]
        isbn_pauses = _ISBN_SEARCH_PAUSES_FIXED
    else:
        passes = [("result-list", sold_out_count, False, True, 1, '2没货')]
        isbn_pauses = _ISBN_SEARCH_PAUSES_FIXED

    for list_class, count, skip_last, strict, click_pause, label in passes:
        result = _try_matching_items(driver, list_class, count, skip_last, strict,
                                     click_pause, label, book_name, book_sale_price,
                                     sale_num, book_isbn)
        if result is not None:
            return result

    return _search_by_isbn(driver, book_isbn, book_sale_price, sale_num, isbn_pauses)


# ---------------------------------------------------------------------------
# Kongfz price lookup and per-book workflow
# ---------------------------------------------------------------------------

def _shipping_fee(result_list, position):
    """Return the shipping fee of the listing at ``position``.

    0 when the fee text contains '包邮'; ``kuaidi_price`` when the fee cannot
    be read.
    """
    try:
        text = result_list.find_elements(By.CLASS_NAME, "ship-fee-box")[position].text
        if "包邮" in text:
            return 0
        return float(text[text.index("¥") + 1:text.index(".") + 2])
    except Exception:
        return kuaidi_price


def get_book_sale_price(url, browser):
    '''
    Get the reference selling price (price plus shipping) of a book on kongfz.

    The first three prices on the page are compared: if the second total is
    more than 5 above the first, the second is used; otherwise if the third is
    more than 5 above the second, the third is used; otherwise the first.

    :param url: kongfz product search URL
    :param browser: webdriver used to load the page
    :return: ``(book_name, book_press, book_press_time, price_final)``, or
        ``('0', '0', '0', 0)`` when anything on the page cannot be read
        (including pages with fewer than three prices)
    '''
    try:
        browser.get(url)
        WebDriverWait(browser, 10).until(
            EC.presence_of_element_located((By.CLASS_NAME, 'product-item-box'))
        )
        result_list = browser.find_element(By.CLASS_NAME, "product-item-box")
        price_boxes = result_list.find_elements(By.CLASS_NAME, "price-info")
        totals = []
        for position in range(3):
            price = float(price_boxes[position].text.replace("¥", ""))
            totals.append(price + _shipping_fee(result_list, position))

        if totals[1] - totals[0] > 5:
            price_final = totals[1]
        elif totals[2] - totals[1] > 5:
            price_final = totals[2]
        else:
            price_final = totals[0]
        print(totals[0], totals[1], totals[2])

        book_name = result_list.find_element(By.CLASS_NAME, "item-name").text
        info = result_list.find_elements(By.CLASS_NAME, "zl-info")
        book_press = info[1].text
        book_press_time = info[2].text
        return book_name, book_press, book_press_time, price_final
    except Exception:
        return '0', '0', '0', 0


def write_scv(path, url, book_sale_price2, book_sale_price, lirun, lirunlv, num):
    """Append one result line (with a trailing comma) to the file at ``path``."""
    _append_row(path, [url, book_sale_price2, book_sale_price, lirun, lirunlv, num, ''])


# Consecutive books for which no price / no sales could be read.
index_404 = 0
sale_num_404 = 0


def get_book_content(book_con, browser, duozhuayu_caiji, browser2):
    '''
    Step two: look up price and recent sales of one ISBN on kongfz and, when
    the sales are high enough, compare the book against duozhuayu.

    :param book_con: ISBN string; anything that is not 13 characters is ignored
    :param browser: webdriver used for kongfz
    :param duozhuayu_caiji: 1 to run the duozhuayu comparison
    :param browser2: webdriver used for duozhuayu
    :return: True when the duozhuayu result has a non-zero flag or a label
        containing '有货'; False otherwise; None when ``book_con`` is not 13
        characters long
    '''
    global index_404, sale_num_404
    if len(book_con) != 13:
        return None

    # Kongfz selling price.
    price_url = ('https://search.kongfz.com/product/?keyword=' + str(book_con) +
                 '&quality=90~&quaSelect=2&page=1&actionPath=quality,sortType&sortType=7')
    book_name = get_book_sale_price(price_url, browser)
    print(book_name)
    book_sale_price = book_name[-1]
    if book_sale_price == 0:
        index_404 = index_404 + 1
    else:
        index_404 = 0
    print(book_sale_price)
    print(type(book_sale_price))
    if book_sale_price < book_mix_price and book_sale_price >= 0:
        time.sleep(10)
        return False
    time.sleep(2)

    # Kongfz recent sales.
    sale_url = ('https://search.kongfz.com/product/?dataType=1&keyword=' + str(book_con) +
                '&sortType=10&page=1&actionPath=sortType')
    num = get_book_sale(browser, sale_url)
    time.sleep(2)
    if num == 0:
        sale_num_404 = sale_num_404 + 1
    else:
        sale_num_404 = 0

    # Record books whose sales reach the threshold.
    if num >= mix_number:
        _append_row("销量达标.csv", [price_url, book_con, book_sale_price, num])

    # Hand the kongfz data to duozhuayu to look for qualifying books.
    if duozhuayu_caiji != 1:
        return False
    if num < mix_number:
        # Lower sales tier: only worth checking when the price is high enough.
        if num < mix_number2 or float(book_sale_price) < book_mix_price2:
            return False
    duozhuayu_price = open_url(browser2, book_con, book_name, book_sale_price, num)
    return duozhuayu_price[-1] != 0 or '有货' in duozhuayu_price[1]


def get_txt(url, content):
    """Append ``content`` to the text file ``url`` unless it is already a line of it.

    :param url: path of the text file; a missing file is treated as empty
    :param content: line to record (without line break)
    :return: True when the line was added, False when it was already present
    """
    try:
        with open(url, 'r') as f:
            # Compare without the line break, otherwise nothing ever matches.
            known = set(line.rstrip('\n') for line in f)
    except FileNotFoundError:
        known = set()

    if content in known:
        return False
    with open(url, 'a') as write_f:
        write_f.write(content + '\n')
        write_f.write('\n')
    return True


def delete_data(delete_num, browser):
    '''
    Delete entries: scroll the page to the bottom, then click the "g" element of
    the second-to-last "wishlist-row" ``delete_num`` times.

    :param delete_num: number of rows to delete; nothing happens if it is <= 0
    :param browser: webdriver showing the list page
    :return: None
    '''
    if not delete_num > 0:
        return

    temp_height = 0
    while True:
        # Scroll down step by step.
        browser.execute_script("window.scrollBy(0,800)")
        # Give the page a moment to react.
        time.sleep(1)
        # Current distance of the scroll position from the top.
        check_height = browser.execute_script(
            "return document.documentElement.scrollTop || window.pageYOffset || document.body.scrollTop;")
        # Unchanged position means the bottom has been reached.
        if check_height == temp_height:
            break
        temp_height = check_height
        print(check_height)

    time.sleep(5)
    for _ in range(0, delete_num):
        rows = browser.find_elements(By.CLASS_NAME, "wishlist-row")
        rows[-2].find_element(By.TAG_NAME, "g").click()
        time.sleep(1)


# ---- Commented-out legacy code, kept verbatim; it continues on the following lines ----
# def jiance_open_chrome(): # ''' # 启动浏览器 # :return: # ''' # # 获取浏览器cookie # user_data_dir = r'--user-data-dir=C:\Users\S2020\AppData\Local\Google\Chrome\User Data' # # #
加载配置数据 # option = webdriver.ChromeOptions() # # option.add_argument('-ignore-certificate-errors') # # option.add_argument('-ignore -ssl-errors') # option.add_argument('--ignore-certificate-errors') #忽略CERT证书错误 # # option.add_argument('--ignore-ssl-errors') #忽略SSL错误 # # option.add_argument('--disable-gpu') # # option.add_argument('--ignore-certificate-errors-spki-list') # # option.add_argument('--ignore-urlfetcher-cert-requests') # # capability = option.to_capabilities() # # capability["acceptInsecureCerts"] = True # # capability['acceptSslCerts'] = True # option.add_argument('log-level=2') # # option.add_argument(user_data_dir) # browser = webdriver.Chrome(chrome_options=option,desired_capabilities=capability) # => 注意这里的参数 # # # browser = webdriver.Chrome() # browser.get('https://www.duozhuayu.com/cart') # browser.implicitly_wait(5) # time.sleep(5) # return browser # def jiance_get_book_content(browser): # ''' # 获取有货数据 # :param browser: # :return: # ''' # sell_books = browser.find_elements_by_class_name("SelItem")[0:-1] # books = [] # for i in sell_books: # if '已预订' in i.find_element_by_class_name("action").text: # continue # book_title = i.find_element_by_class_name("book-title").text # middle_section = i.find_element_by_class_name("middle-section").text # Price = i.find_element_by_class_name("Price").text # action = i.find_element_by_class_name("action").text # if '已锁定' not in action: # books.append([book_title,middle_section,Price,action]) # return books # # # # def deal_data(data1,data2): # ''' # 对比前后数据不一样的地方,查看品相,合格得返回,不合格得不处理 # :param data1: # :param data2: # :return: # ''' # # new_data = [] # qty_good = ['品相良好','轻度污渍','轻度磨损或破损'] # #需要支付的数据 # for i in data2: # if i not in data1 and i[1] in qty_good: # new_data.append(i) # # # #未支付的数据被释放 # for a in data1: # for b in data2: # if a[3] == '支付未完成,再次支付': # if a[0] == b[0] and b[3] != '支付未完成,再次支付': # new_data.append(a) # print(new_data) # return new_data # # # # def new_order(browser,j): # ''' # 对应数据下单 # :param 
browser: # :return: # ''' # sell_books = browser.find_elements_by_class_name("SelItem")[0:-1] # a = 0 # order_data = [] # # for j in data: # print(j) # for i in range(0,len(sell_books)): # # print (sell_books[i].find_element_by_class_name("book-title").text) # if sell_books[i].find_element_by_class_name("book-title").text == j[0]: # # print(1) # if j[3] != '支付未完成,再次支付': # action = sell_books[i].find_element_by_class_name("action") # browser.execute_script('arguments[0].scrollIntoView(true)', action) # action.click() # time.sleep(0.5) # sell_books[i].find_element_by_xpath("//*[contains(text(),'支付锁定款')]").click() # time.sleep(1) # sell_books[i].find_elements_by_xpath("//*[contains(text(),'微信支付')]")[-1].click() # time.sleep(1) # browser.refresh() # time.sleep(3) # sell_books = browser.find_elements_by_class_name("SelItem")[0:-1] # a = 1 # order_data.append(j) # else: # #删除不需要支付的数据 # # browser.execute_script('arguments[0].scrollIntoView()',elm) # 此处(200,0)为网页偏移坐标,200为横坐标,0为纵坐标 # elm = sell_books[i].find_element_by_tag_name("g") # browser.execute_script('arguments[0].scrollIntoView(true)',elm) # time.sleep(1) # elm.click() # time.sleep(1) # if a == 0: # print('无数据要处理') # return False # else: # print(msg_data,end='') # print("处理成功") # return order_data # # return data # # def send_dingding(server_msg): # ''' # 发送短信给主人 # :return: # ''' # # 定义请求的URL # timestamp = str(round(time.time() * 1000)) # url = 'https://oapi.dingtalk.com/robot/send?access_token=12b9f3149bdb71dc72023fff597238fac097b14ae0e3fa7925782a2e9c0b629c' # secret = 'SEC50fd9af520785708023669288488b02ce2d196f084f05b5f8f7babd899af1d0d' # secret_enc = secret.encode('utf-8') # string_to_sign = '{}\n{}'.format(timestamp, secret) # string_to_sign_enc = string_to_sign.encode('utf-8') # hmac_code = hmac.new(secret_enc, string_to_sign_enc, digestmod=hashlib.sha256).digest() # sign = quote_plus(base64.b64encode(hmac_code)) # # headers = {'Content-Type': 'application/json'} # webhook = url + '×tamp=' + timestamp + 
def open_page():
    """Start a browser through ``open_chrome`` and try to log in to kongfz.com.

    The login is best-effort: a failure while driving the login form is
    reported and the browser is still returned.

    :return: the browser created by ``open_chrome``.
    """
    global name_index

    browser = open_chrome()
    try:
        browser.get("https://www.kongfz.com/")
        time.sleep(5)
        # Hover the first "item-info" element before clicking the login button.
        logon = browser.find_elements_by_class_name("item-info")[0]
        ActionChains(browser).move_to_element(logon).perform()
        time.sleep(2)
        browser.find_element_by_class_name("login-btn").click()
        time.sleep(3)
        # The login form lives inside an iframe.
        browser.switch_to.frame(browser.find_element_by_id("iframe_login"))
        time.sleep(1)
        if name_index == len(name_msg) - 1:
            name_index = 0
        # NOTE(review): the first entry of name_msg is always used although
        # name_index is advanced -- confirm whether rotation was intended.
        browser.find_element_by_id("username").send_keys(name_msg[0][0])
        time.sleep(0.5)
        browser.find_element_by_id("password").send_keys(name_msg[0][1])
        name_index = name_index + 1
        time.sleep(1)
        browser.find_element_by_class_name("autoLogin").click()
        time.sleep(1)
        browser.find_element_by_class_name("login_submit").click()
        time.sleep(2)
        browser.switch_to.default_content()
    except Exception as exc:
        # Keep going without a login, but say why it failed.
        print("登录失败:" + repr(exc))
    time.sleep(10)
    return browser


def get_shop_isbn(url, url_shop_id):
    """Collect the ``isbn`` attribute of the books listed on one shop page.

    Every ``a.row-name`` link yields an item id (its href with the
    ``https://book.kongfz.com/<url_shop_id>`` prefix and all slashes removed);
    the ``div`` carrying that ``itemid`` supplies the ISBN. Links without an
    href, without a matching ``div`` or without an ``isbn`` attribute are
    skipped.

    :param url: shop listing page to download.
    :param url_shop_id: shop id used to strip the link prefix.
    :return: list of ``isbn`` attribute values (may contain empty strings).
    """
    ret = requests.get(url, headers=headers)
    ret.encoding = ret.apparent_encoding
    soup = BeautifulSoup(ret.text, 'html.parser')

    prefix = 'https://book.kongfz.com/' + url_shop_id
    book_content = []
    for link in soup.findAll(name='a', attrs={"class": "row-name"}):
        href = link.get('href')
        if href is None:
            continue
        itemid = href.replace(prefix, '').replace('/', '')
        item = soup.find(name='div', attrs={"itemid": itemid})
        if item is None or not item.has_attr('isbn'):
            continue
        book_content.append(item['isbn'])
    return book_content


def get_isbn_url(browser, sale_url):
    """Open a sold-items search page and collect links to sold book pages.

    :param browser: browser used to load the page.
    :param sale_url: search URL to open.
    :return: up to 50 hrefs of ``item-link`` elements that contain
        ``https://book.kongfz.com/0/``.
    """
    browser.get(sale_url)
    time.sleep(5)

    shop_list = []
    for element in browser.find_elements_by_class_name("item-link"):
        href = element.get_attribute("href")
        # Elements without an href cannot be sold-item links.
        if href is None or "https://book.kongfz.com/0/" not in href:
            continue
        print(href)
        shop_list.append(href)
        if len(shop_list) == 50:
            break
    return shop_list


def get_sale_shop_url(url):
    """Extract a shop id from the meta tags of a sold-item page.

    Looks for ``url=https://shop.kongfz.com/<id>/`` in the meta tags nested
    under the second ``meta`` tag of the page head.

    :param url: sold-item page to download.
    :return: the shop id, or ``""`` when no meta tag carries the marker.
    :raises Exception: lookup errors (missing head, fewer than two meta tags,
        no ``/`` after the id) propagate to the caller.
    """
    marker = "url=https://shop.kongfz.com/"
    ret = requests.get(url, headers=headers)
    ret.encoding = ret.apparent_encoding
    soup = BeautifulSoup(ret.text, 'html.parser')

    book_head = soup.find(name='head')
    book_metas = book_head.findAll(name='meta')
    for meta in book_metas[1].findAll(name='meta'):
        text = str(meta)
        if marker in text:
            rest = text[text.index(marker) + len(marker):]
            return rest[:rest.index("/")]
    return ""


def get_shop_msg(browser, url):
    """Read a shop's sale count and listing count from its pages.

    :param browser: browser used to load the pages.
    :param url: shop home page; ``url + "/all/"`` is opened for the listing count.
    :return: ``(sale_num, shop_num)`` as strings with the surrounding
        decoration text removed.
    """
    browser.get(url)
    time.sleep(5)
    sale_num = browser.find_element_by_class_name("sale-count").text
    for token in ("(", ")", "笔"):
        sale_num = sale_num.replace(token, "")
    print(sale_num)

    browser.get(url + "/all/")
    time.sleep(5)
    tag = browser.find_elements_by_class_name("crumbs-nav-start")[-1].text
    shop_num = tag.replace("条结果", "").replace(" ", "")
    print(shop_num)
    print("\n")
    return sale_num, shop_num


# Reference date stored with newly found shops
# (presumably the date 6 days back -- confirm against get_bak_data).
date_6 = int(get_bak_data(get_now_date(),6))


def get_yesterday():
    """Return yesterday's date as a ``YYYYMMDD`` string."""
    today = datetime.date.today()
    return str(today - datetime.timedelta(days=1)).replace("-", "")


def get_book_isbn(url):
    """Extract the 13 characters following ``ISBN:`` from a book page's meta tags.

    :param url: book page to download.
    :return: the 13-character string, or ``""`` when it is not found or the
        page cannot be fetched or parsed.
    """
    isbn = ""
    try:
        ret = requests.get(url, headers=headers)
        ret.encoding = ret.apparent_encoding
        soup = BeautifulSoup(ret.text, 'html.parser')
        book_head = soup.find(name='head')
        book_metas = book_head.findAll(name='meta')
        for meta in book_metas[1].findAll(name='meta'):
            text = str(meta)
            if "ISBN:" in text:
                start = text.index("ISBN:") + 5
                isbn = text[start:start + 13]
                break
        return isbn
    except Exception:
        # Best-effort lookup: any failure yields whatever was found so far.
        return isbn


def get_book_url(url, browser):
    """Collect ``[href, sold_date]`` pairs from a product listing page.

    :param url: listing page to open.
    :param browser: browser used to load the page.
    :return: list of ``[href, sold_date]``; ``sold_date`` is the ``sold-time``
        text with ``-`` and ``" 已售"`` removed. Items missing either element
        are skipped; a page-level failure prints ``except`` and returns what
        was collected.
    """
    data_hrefs = []
    try:
        browser.get(url)
        time.sleep(5)
        # Raises when the result container is missing, which ends the page.
        browser.find_element_by_class_name("product-item-box")
        for item in browser.find_elements_by_class_name("product-item-wrap"):
            try:
                href_data = item.find_element_by_class_name("img-box").get_attribute("href")
                sale_data = item.find_element_by_class_name("sold-time").text.replace("-", "").replace(" 已售", "")
            except Exception:
                continue
            data_hrefs.append([href_data, sale_data])
    except Exception:
        print("except")
    return data_hrefs


def get_book_url_book(url_all):
    '''
    Step 1: collect the book URLs found on one listing page.

    :param url_all: listing page to download.
    :return: list of hrefs with ``.html`` replaced by ``_10_4_1.html``.
    '''
    ret = requests.get(url_all, headers=headers)
    ret.encoding = ret.apparent_encoding
    soup = BeautifulSoup(ret.text, 'html.parser')
    titles = soup.find(name='div', attrs={"id": "listBox"}).findAll(name='div', attrs={"class": "title"})

    url_href = []
    for title in titles:
        link = title.find(name='a', attrs={"class": "link"})
        # The suffix selects a filtered variant of the book page.
        url_href.append(link['href'].replace(".html", "") + "_10_4_1.html")
    time.sleep(3)
    return url_href


def get_book_isbn_nook(url, browser):
    '''
    Read the ISBN shown on a book page.

    :param url: book page to open.
    :param browser: browser used to load the page.
    :return: text of the first ``item`` element containing ``ISBN`` with the
        ``"ISBN: "`` label removed, or ``""`` when none is found or the page
        fails to load.
    '''
    isbn = ""
    try:
        browser.get(url)
        time.sleep(5)
        for item in browser.find_elements_by_class_name("item"):
            if "ISBN" in item.text:
                print(item.text)
                isbn = item.text.replace("ISBN: ", "")
                break
        return isbn
    except Exception:
        return isbn


if __name__ == "__main__":
    # NOTE(review): the nesting of this script was reconstructed from a
    # whitespace-flattened source -- confirm the loop levels before running.

    # Trim the duozhuayu wishlist before collecting.
    browser = open_chrome2()
    browser.get('https://www.duozhuayu.com/cart/wishlist')
    browser.implicitly_wait(5)
    time.sleep(5)  # give the page time to render before reading it
    book_num = browser.find_element_by_class_name("cart-header").find_element_by_class_name("info").text
    # The count starts at the sixth character of the header text.
    delete_data(int(book_num[5:]) - max_num, browser)
    browser.quit()
    time.sleep(10)

    duozhuayu_caiji = 1  # 1 enables the duozhuayu check in get_book_content
    duozhuayu_flag = 0   # books for which get_book_content returned True

    browser2 = open_chrome2()
    browser2.get("https://www.duozhuayu.com/search/book/")
    time.sleep(5)

    # Shop mode: starting from the known shops, find other shops that sold the
    # same ISBNs and run the book check on every qualifying shop's stock.
    browser = open_page()
    shop_data = read_scv(shop_path)
    shop_new_data = []
    shop_index = 0  # consecutive get_shop_msg failures
    for shop in shop_data:
        for i in range(1, 10):
            isbns = get_shop_isbn(shop_url_comb(shop[0], i), shop[0])
            print(isbns)
            for isbn in isbns:
                if isbn == "":
                    continue
                sale_url = "https://search.kongfz.com/product/?dataType=1&keyword=" + str(
                    isbn) + "&sortType=10&page=1&actionPath=sortType"
                sale_lists = get_isbn_url(browser, sale_url)
                if len(sale_lists) == 0:
                    time.sleep(5)
                print(sale_lists)
                for shop_list in sale_lists:
                    if shop_index >= 5:
                        time.sleep(30)
                        print("连续五次采集店铺信息失败,默认ip到期,更换ip")
                    try:
                        shop_id = get_sale_shop_url(shop_list)
                    except Exception:
                        continue
                    print(shop_id)
                    url = "http://shop.kongfz.com/" + str(shop_id)
                    try:
                        sale_num, shop_num = get_shop_msg(browser, url)
                        shop_index = 0
                    except Exception:
                        print("店铺采集失败")
                        shop_index = shop_index + 1
                        continue

                    # Keep only shops whose listing count is in range ...
                    if not (shop_book_mix_num <= int(shop_num) <= shop_book_max_num):
                        continue
                    print(shop_new_data)
                    # ... and whose sales-to-listings ratio is high enough.
                    if int(sale_num) / int(shop_num) < shop_sale_num:
                        continue

                    shop_new_data = [[shop_id, str(date_6), '8'], ]
                    with open("test.txt", "a") as f:
                        f.write(shop_id)
                        f.write('\n')
                    shiyong_shop_data = []
                    print(shop_data)
                    # A separate loop name keeps the outer ``shop`` intact.
                    for new_shop in shop_new_data:
                        # Collect at most 21 listing pages of the new shop.
                        for s in range(1, 22):
                            contents = get_book_name_isbn(shop_url_comb(str(new_shop[0]), s), str(new_shop[0]))
                            write_content_to_txt(contents, str(new_shop[0]))
                            print(new_shop[0], '第' + str(s) + '页数据已采集完成')
                            if int(contents[1]) <= int(new_shop[1]):
                                break
                        shiyong_shop_data.append([new_shop[0], get_now_date(), new_shop[2]])
                        book_isbns = read_scv(str(new_shop[0]) + '.csv')
                        # The per-shop file is removed as soon as it has been read.
                        rm_shop_csv(new_shop[0] + '.csv')

                        data = deal_shop_id(shiyong_shop_data, shop_data)
                        print(data)
                        # Truncate the shop file before writing the merged data.
                        with open(shop_path, "w"):
                            pass
                        write_new_csv(shop_path, data)
                        shiyong_shop_data = []

                        sousuo_flag = 0  # searches since the last reset
                        buhege_book = 0  # consecutive books that did not qualify
                        for book_isbn in book_isbns:
                            if buhege_book >= 50:
                                break
                            print("搜索次数:" + str(sousuo_flag))
                            # Too many consecutive zero prices: restart the browser.
                            if index_404 > 15:
                                index_404 = 0
                                print("sousuo_flag:" + str(index_404))
                                print("连续搜索不到目标,更换ip")
                                sousuo_flag = 0
                                browser.quit()
                                time.sleep(10)
                                browser = open_page()
                            # Too many consecutive zero sold counts: restart as well.
                            if sale_num_404 > 15:
                                index_404 = 0
                                sale_num_404 = 0
                                print("sousuo_flag:" + str(index_404))
                                print("连续搜索不到目标销售量,更换ip")
                                sousuo_flag = 0
                                time.sleep(5)
                                browser.quit()
                                time.sleep(10)
                                browser = open_page()
                            if sousuo_flag > sousuo_num:
                                print("sousuo_flag:" + str(sousuo_flag))
                                print("达到缺纸,更换ip")
                                sousuo_flag = 0
                                time.sleep(5)
                            if xianzhi_flag >= 30:
                                break
                            if isbn_num >= max_isbn_num:
                                break
                            sousuo_flag = sousuo_flag + 1
                            print(book_isbn)
                            if len(book_isbn[0]) != 13:
                                continue
                            if get_book_content(book_isbn[0], browser, duozhuayu_caiji, browser2):
                                buhege_book = 0
                                duozhuayu_flag = duozhuayu_flag + 1
                            else:
                                buhege_book = buhege_book + 1
                            print("多抓鱼对比通过数量:" + str(duozhuayu_flag), "通过isbn搜索数量:" + str(isbn_num))
                        if isbn_num >= max_isbn_num:
                            break
                        if xianzhi_flag >= 30:
                            break