线程间通信：共享变量和queue

　　在多线程中，数据是共享的，如何在多个线程之间安全地通信，是首先要考虑的问题。

#线程间的通信

import time
import threading
from threading import RLock

# Shared work list: the list-page crawler appends detail-page URLs here and
# the detail-page workers pop them off.
detail_url_list: list = []

# Re-entrant lock used by the workers when they touch the shared list.
lock = threading.RLock()

def get_detail_html(url=None):
    """Simulate downloading one article detail page.

    Takes one URL off the shared ``detail_url_list`` and pretends to fetch it
    (a 2-second sleep stands in for the request). It is meant to be used as a
    thread target with several threads side by side: a plain ``for`` loop
    over the list was the first idea, but the list-page crawl is faster than
    the detail-page crawl, so multiple workers are used instead.

    Args:
        url: Ignored; the URL that is processed is popped from the shared
            list. The parameter is optional so the function can be used as a
            thread target with or without ``args``.

    Returns:
        None. Returns immediately, without printing anything, when the
        shared list is empty.
    """
    # Hold the lock only while touching the shared list. ``with`` releases it
    # even if something raises, and keeping the slow "download" outside the
    # critical section lets the worker threads overlap instead of running
    # one after another.
    with lock:
        if not detail_url_list:
            return
        url = detail_url_list.pop()
    print('get detail html started')
    time.sleep(2)
    print('get detail html end')


def get_detail_url(url):
    """Simulate crawling the article list page.

    Sleeps 4 seconds in place of the real request, then appends 20 generated
    detail-page URLs, one at a time, to the shared ``detail_url_list``.

    Args:
        url: List-page address; not used by this simulated version.
    """
    print('get detail url started')
    time.sleep(4)
    template = 'http://projectsedu.com/{id}'
    for page_id in range(20):
        detail_url_list.append(template.format(id=page_id))
    print('get detail url end')

#需求是：把爬取文章列表页得到的url，交给爬取文章详情页的线程去处理。
#这个时候，就涉及到线程间的资源通信

#第一种方法就是  共享变量（共享变量其实就是全局变量，给各个函数调用）
#具体方法如下：


if __name__ == '__main__':
    # Fill the shared list first: each worker below takes a single URL and
    # does not wait for more to arrive, so the list-page crawl has to finish
    # before the workers are started.
    thread2 = threading.Thread(target=get_detail_url, args=('http://bolezaixian.com',))
    thread2.start()
    thread2.join()

    # get_detail_html declares a ``url`` parameter, so every worker thread is
    # given an argument; the value itself is unused because the workers pop
    # their URL from the shared list.
    for i in range(10):
        thread1 = threading.Thread(target=get_detail_html, args=('',))
        thread1.start()
共享变量也是要加锁的。

import threading
from threading import Lock
#把共享变量存在settings配置文件中
import settings
import time

# Non-reentrant lock for the threads that share the URL list.
lock = threading.Lock()


def get_detail_html():
    """Worker loop that simulates downloading article detail pages.

    Repeatedly takes a URL off the shared container
    ``settings.detail_list_url`` and pretends to fetch it (a 2-second sleep
    stands in for the request). It is intended as a thread target: the
    list-page crawl is faster than the detail-page crawl, so several of
    these workers are run side by side.

    The loop has no normal exit. It only ends if accessing the shared
    container raises, in which case the error is printed and the worker
    stops.
    """
    # NOTE(review): presumably a plain list; only len() and pop() are used.
    detail_url_list = settings.detail_list_url
    idle_wait = 0.1  # seconds to back off while there is nothing to fetch

    while True:
        try:
            # The emptiness check and the pop must happen as one step under
            # the lock; otherwise two workers can both see the last URL and
            # the slower one's pop() raises IndexError.
            with lock:
                url = detail_url_list.pop() if len(detail_url_list) else None
        except Exception as e:
            print(e)
            print('线程已运行完了')
            break

        if url is None:
            # Nothing available yet: sleep briefly instead of spinning at
            # full CPU while waiting for the list-page crawler.
            time.sleep(idle_wait)
            continue

        # The simulated download runs outside the lock so workers overlap.
        print('get detail html started')
        time.sleep(2)
        print('get detail html end')


def get_detail_url():
    """Simulate crawling the article list page.

    Sleeps 4 seconds in place of the real request, then appends 20 generated
    detail-page URLs to the shared container ``settings.detail_list_url``.
    """
    detail_url_list = settings.detail_list_url
    print('get detail url started')
    time.sleep(4)
    for i in range(20):
        detail_url_list.append('http://projectsedu.com/{id}'.format(id=i))
    # Report completion once, after all URLs have been added (previously this
    # line sat inside the loop and printed after every single URL).
    print('get detail url end')


if __name__ == '__main__':
    began = time.time()

    # Ten detail-page workers; they are started but never joined.
    workers = [threading.Thread(target=get_detail_html) for _ in range(10)]
    for worker in workers:
        worker.start()

    # Only the list-page crawler is waited for, so the time reported below
    # covers the producer alone.
    producer = threading.Thread(target=get_detail_url)
    producer.start()
    producer.join()

    elapsed = time.time() - began
    print('total_time:{}'.format(elapsed))

#通过queue的方式进行线程间同步通信

-----------------------------------------------------------------------------------------------------------------

from queue import Queue

import time
import threading


def get_detail_html(queue):
    """Worker loop that simulates downloading article detail pages.

    Takes URLs from ``queue`` forever and pretends to fetch each one (a
    2-second sleep stands in for the request). Intended as a thread target;
    it never returns.

    Args:
        queue: A ``queue.Queue`` of detail-page URLs shared with the
            list-page crawler.
    """
    while True:
        # get() blocks while the queue is empty, so an idle worker simply
        # waits here until the producer puts another URL.
        url = queue.get()
        try:
            print('get detail html started')
            time.sleep(2)
            print('get detail html end')
        finally:
            # Acknowledge the item: Queue.join() only returns once every
            # item taken with get() has been matched by a task_done() call.
            queue.task_done()


def get_detail_url(queue):
    """Producer loop that simulates crawling the article list page.

    Forever: sleeps 2 seconds in place of the real request, then puts 20
    generated detail-page URLs on ``queue``. Intended as a thread target; it
    never returns.

    Args:
        queue: A ``queue.Queue`` shared with the detail-page workers.
    """
    url_template = "https://projectsedu.com/{id}"
    while True:
        print('get detail url started')
        time.sleep(2)
        for page_id in range(20):
            queue.put(url_template.format(id=page_id))
        print('get detail url end')


if __name__ == "__main__":
    # Always bound the queue with maxsize so a fast producer cannot grow it
    # without limit and exhaust memory.
    detail_url_queue = Queue(maxsize=1000)

    # The producer has to be started as well as created; otherwise nothing is
    # ever put on the queue and every worker blocks in get() forever.
    thread_detail_url = threading.Thread(target=get_detail_url, args=(detail_url_queue,))
    thread_detail_url.start()

    for i in range(10):
        html_thread = threading.Thread(target=get_detail_html, args=(detail_url_queue,))
        html_thread.start()

    # join() blocks while the queue has items that have not been acknowledged
    # with task_done(). That acknowledgement belongs to the consumer of each
    # item, not to the main thread: calling task_done() more times than items
    # were put raises ValueError.
    # NOTE(review): join() returns at once if nothing has been queued yet;
    # the non-daemon threads started above keep the process alive regardless.
    detail_url_queue.join()

qsize()方法返回队列的大小；empty()方法判断队列是否为空，如果为空，get()会阻塞在那里；full()方法判断队列是否已满，如果已满，put()方法会阻塞在那里。