C#多线程图片爬虫

写了个简单的多线程图片爬虫,整理一下。数据已经爬下来了,但图片URL需要自行拼接:首先从Lawyers表中取出RawData字段,RawData是JSON格式的数据,其中有一个list字段,我们只需要list中的pic和XZQH两个字段来拼接图片地址。拼接URL的规则如下:

http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0,2)}00/lsfw/lsuser/{model.pic.Substring(0,model.pic.LastIndexOf('.'))}/{model.pic.Substring(model.pic.LastIndexOf('.') + 1)}

得到图片URL之后就好说了,接下来就是常规的下载操作。线程调度的核心思想是始终保持四个线程同时工作:当其中一个线程下载完成或下载失败后,就移除该线程,并启动一个新线程重复同样的工作。代码如下:

    /// <summary>
    /// Multi-threaded image crawler. Reads <see cref="Lawyers"/> rows in pages,
    /// builds each image URL from the row's JSON payload and downloads the image
    /// to disk, keeping a fixed number of worker tasks busy until every page has
    /// been claimed.
    /// </summary>
    public class Main : HandleProgramBase, IHandleProgram
    {
        public readonly IUnitOfWork _iUnitOfWork;

        public Main(IUnitOfWork iUnitOfWork)
        {
            _iUnitOfWork = iUnitOfWork;
        }

        /// <summary>Number of worker tasks kept running at the same time.</summary>
        private const int WorkerCount = 4;

        /// <summary>Size of the row range a worker claims per run.</summary>
        private const int BatchSize = 50;

        /// <summary>Maximum download attempts per image before giving up.</summary>
        private const int MaxAttempts = 10;

        /// <summary>Error log is truncated once it grows beyond this size.</summary>
        private const long MaxLogBytes = 5 * 1024 * 1024;

        // NOTE(review): the path separators of this literal were lost in the source this
        // file was recovered from ("D:ImageTestMulti"); "D:\ImageTest\Multi" is the most
        // plausible reconstruction - confirm the intended folder before running.
        private const string SaveRoot = @"D:\ImageTest\Multi";

        private List<Task> threadManager = new List<Task>();
        private static readonly object locker = new object();
        private static readonly object counter = new object();
        private static readonly object logLock = new object();
        private static ConcurrentQueue<int> counterQueue = new ConcurrentQueue<int>();
        private const int total = 150136;
        private static int start = 1;
        private static int downloadNumber = 0;

        /// <summary>
        /// Runs the crawl: starts the workers, replaces each worker as soon as it
        /// finishes, waits for the last ones and prints the elapsed time.
        /// </summary>
        /// <param name="args">Command line arguments (not used).</param>
        public override void Entrance(string[] args)
        {
            var watcher = Stopwatch.StartNew();

            while (HasPendingWork())
            {
                if (threadManager.Count == 0)
                {
                    for (var i = 0; i < WorkerCount; i++)
                    {
                        threadManager.Add(Task.Factory.StartNew(DownloadImg));
                    }
                    continue;
                }

                // WaitAny does not throw for faulted tasks; failures are reported in Retire.
                Task.WaitAny(threadManager.ToArray());
                foreach (var finished in threadManager.Where(t => t.IsCompleted).ToList())
                {
                    threadManager.Remove(finished);
                    Retire(finished);
                    threadManager.Add(Task.Factory.StartNew(DownloadImg));
                }
            }

            try
            {
                Task.WaitAll(threadManager.ToArray());
            }
            catch (AggregateException)
            {
                // Each failure is logged individually by Retire below, so the summary
                // line is still printed even when a worker crashed.
            }

            foreach (var finished in threadManager)
            {
                Retire(finished);
            }
            threadManager.Clear();

            watcher.Stop();

            Console.WriteLine();
            Console.WriteLine("Download Completed.Total time: " + watcher.ElapsedMilliseconds + " ms.");
        }

        /// <summary>Returns true while there are row ranges no worker has claimed yet.</summary>
        private static bool HasPendingWork()
        {
            // "start" is advanced by workers under this lock, so read it the same way.
            lock (locker)
            {
                return start < total;
            }
        }

        /// <summary>Logs the failure of a completed worker (if any) and disposes it.</summary>
        private static void Retire(Task finished)
        {
            if (finished.IsFaulted)
            {
                // Reading Exception also marks the failure as observed.
                WriteLog("Worker task failed: " + finished.Exception.GetBaseException().Message);
            }
            finished.Dispose();
        }

        /// <summary>
        /// Worker body: claims the next page of rows and downloads the image of every
        /// row in it. A failure on one row is logged and does not abort the rest of the page.
        /// </summary>
        private void DownloadImg()
        {
            var lawyerList = TakeNextBatch();
            if (lawyerList.Count == 0) return;

            using (var web = new WebClient())
            {
                foreach (var lawyer in lawyerList)
                {
                    try
                    {
                        DownloadOne(web, lawyer);
                    }
                    catch (Exception e)
                    {
                        // Malformed JSON, invalid path characters, etc.: skip this row only.
                        LogFailure(lawyer, e.Message);
                    }
                }
            }
        }

        /// <summary>
        /// Claims the next row range and loads it. Returns an empty list when all
        /// ranges have been claimed.
        /// </summary>
        private List<Lawyers> TakeNextBatch()
        {
            // The query runs inside the lock, as in the original design, so only one
            // worker at a time uses the unit of work.
            lock (locker)
            {
                if (start >= total) return new List<Lawyers>();

                var end = Math.Min(start + BatchSize, total);
                // NOTE(review): whether both bounds are inclusive depends on the
                // Resource.GetPagedLawyer query, which is not visible here - verify that
                // consecutive ranges neither skip nor duplicate a row.
                var batch = _iUnitOfWork.Implement<Lawyers>(string.Format(Resource.GetPagedLawyer, start, end)).ToList();
                start = end;
                return batch;
            }
        }

        /// <summary>
        /// Downloads the image of a single row, retrying up to <see cref="MaxAttempts"/>
        /// times. Gives up immediately on HTTP 404.
        /// </summary>
        private static void DownloadOne(WebClient web, Lawyers lawyer)
        {
            var imgUrl = BuildImageUrl(lawyer.RawData);
            if (imgUrl == null)
            {
                LogFailure(lawyer, "RawData does not contain a usable pic/XZQH pair. ");
                return;
            }

            // Handles names such as 2017-04-19/B748FA5EF1517886AF76A11CDACE5378.png:
            // every "/" becomes a sub-folder below the firm's directory.
            var savePath = SaveRoot + "\\" + lawyer.LawFrimKey;
            var fileName = savePath + "\\" + lawyer.ImageName.Replace("/", "\\");
            var folder = Path.GetDirectoryName(fileName);

            for (var attempt = 1; attempt <= MaxAttempts; attempt++)
            {
                try
                {
                    // CreateDirectory creates all missing levels and is a no-op when present.
                    Directory.CreateDirectory(folder);
                    if (!File.Exists(fileName)) web.DownloadFile(imgUrl, fileName);
                }
                catch (Exception e)
                {
                    if (attempt == MaxAttempts || IsNotFound(e))
                    {
                        WriteLog($"{lawyer.ImageName}爬取失败! 错误:{e.Message}当前Id:{lawyer.Id}。");
                        return;
                    }

                    // Back off before the next attempt.
                    Thread.Sleep(1000);
                    continue;
                }

                ReportProgress();
                return;
            }
        }

        /// <summary>
        /// Builds the image URL from the JSON payload of a row, or returns null when
        /// the payload lacks the fields the URL is composed of.
        /// </summary>
        private static string BuildImageUrl(string rawData)
        {
            if (string.IsNullOrEmpty(rawData)) return null;

            var model = JsonConvert.DeserializeObject<RawData>(rawData)?.list;
            if (model?.pic == null || model.XZQH == null || model.XZQH.Length < 2) return null;

            var dot = model.pic.LastIndexOf('.');
            if (dot < 0) return null;

            return $"http://www.xxxxx.cn/imagetype/{model.XZQH.Substring(0, 2)}00/lsfw/lsuser/{model.pic.Substring(0, dot)}/{model.pic.Substring(dot + 1)}";
        }

        /// <summary>Returns true when the error is a web error carrying an HTTP 404 response.</summary>
        private static bool IsNotFound(Exception error)
        {
            // "as" instead of a hard cast: the error may be an IO or argument exception.
            var response = (error as WebException)?.Response as HttpWebResponse;
            return response?.StatusCode == HttpStatusCode.NotFound;
        }

        /// <summary>Writes the standard per-image failure line to the error log.</summary>
        private static void LogFailure(Lawyers lawyer, string reason)
        {
            WriteLog($"{lawyer.ImageName}爬取失败! 错误:{reason}当前Id:{lawyer.Id}。");
        }

        /// <summary>Counts a finished image and refreshes the status line on the console.</summary>
        private static void ReportProgress()
        {
            counterQueue.Enqueue(1);

            // Show the download count on the console; the lock keeps the
            // cursor moves of different workers from interleaving.
            lock (counter)
            {
                Console.WriteLine(Resource.Space);
                Console.SetCursorPosition(0, Console.CursorTop - 1);
                Console.Write(Resource.DownloadNumber, ++downloadNumber, counterQueue.Count);
            }
        }

        /// <summary>
        /// Appends a timestamped line to ErrorLog.txt next to the running executable.
        /// The file is started over once it exceeds 5 MB. Logging is best effort:
        /// a failure to write is reported on the console and never thrown.
        /// </summary>
        /// <param name="errMsg">Message to append.</param>
        public static void WriteLog(string errMsg)
        {
            DateTime dt = DateTime.Now;
            try
            {
                string filePathName;
                using (var process = System.Diagnostics.Process.GetCurrentProcess())
                {
                    filePathName = process.MainModule.FileName;
                }

                int pos = filePathName.LastIndexOf("\\", StringComparison.Ordinal);
                filePathName = (pos != -1 ? filePathName.Substring(0, pos + 1) : string.Empty) + "ErrorLog.txt";

                // The writer is opened AND closed inside the lock so that two workers
                // never hold the file at the same time.
                lock (logLock)
                {
                    var logInfo = new FileInfo(filePathName);
                    bool append = !logInfo.Exists || logInfo.Length <= MaxLogBytes;
                    using (var sw = new StreamWriter(filePathName, append))
                    {
                        sw.WriteLine(dt.ToShortDateString() + "  " + dt.ToShortTimeString() + "  " + errMsg);
                    }
                }
            }
            catch (Exception ex)
            {
                Console.WriteLine(ex.Message);
            }
        }

        public override string Helper { get; protected set; }
    }

几个实体类:

    /// <summary>
    /// One row loaded from the Lawyers table; the crawler downloads one image per row.
    /// </summary>
    public class Lawyers
    {
        // Row identifier; written to the error log when the image cannot be downloaded.
        public int Id { get; set; }
        // Used as the name of the per-row sub-folder the image is saved under.
        // NOTE(review): "Frim" is presumably a misspelling of "Firm"; kept because the
        // name may be bound to a column by the data layer - confirm before renaming.
        public string LawFrimKey { get; set; }
        // JSON text that is deserialized into the RawData class to obtain pic and XZQH.
        public string RawData { get; set; }
        // Target file name; may contain "/" (e.g. 2017-04-19/B748FA5EF1517886AF76A11CDACE5378.png).
        public string ImageName { get; set; }
    }

    /// <summary>
    /// Shape of the JSON stored in <c>Lawyers.RawData</c>; only the "list" member is read.
    /// </summary>
    public class RawData
    {
        // Lower-case name matches the JSON member it is deserialized from.
        public Lawyer list { get; set; }
    }

    /// <summary>
    /// The two members of the JSON "list" object that the image URL is composed of.
    /// </summary>
    public class Lawyer
    {
        // Picture file name; split at its last '.' into the two trailing URL segments.
        public string pic { get; set; }
        // Code whose first two characters, followed by "00", form a URL path segment.
        public string XZQH { get; set; }
    }

原文地址:https://www.cnblogs.com/ligykq/p/10315480.html