抓取 在线翻译(Google、Yahoo)

最近公司要求我们做一个调用Google、Yahoo全文翻译的工具,在园子里搜索了一下,找到了一篇(不大记得了,好像是VB写的),看了他的做法以后,做了一点点改进,发出来给需要的朋友看看,有什么不对的地方请大家斧正!


using System;
using System.Collections.Generic;
using System.Text;
using System.Net;
using System.IO;
using System.Threading;

namespace Transn
{
    /// <summary>
    /// Screen-scrapes the Google and Yahoo web translation pages: the text is posted as a
    /// URL-encoded form and the translated text is cut out of the returned HTML.
    /// </summary>
    class TsMachine
    {
        /// <summary>Common shape of the single-attempt translators, so the retry logic exists once.</summary>
        private delegate string Translator(string texts, string languages);

        /// <summary>
        /// Translates <paramref name="texts"/> with Google, retrying once after one second.
        /// </summary>
        /// <param name="texts">Text to translate.</param>
        /// <param name="languages">Language pair in the form "source|target", e.g. "en|zh-CN".</param>
        /// <returns>The translated text, or an empty string when both attempts fail.</returns>
        public string Google(string texts, string languages)
        {
            return TranslateWithRetry(Google_T, texts, languages);
        }

        /// <summary>
        /// Performs one Google translation request and extracts the result from the page.
        /// </summary>
        /// <exception cref="WebException">The HTTP request failed.</exception>
        /// <exception cref="InvalidOperationException">The result markers were not found in the page.</exception>
        private string Google_T(string texts, string languages)
        {
            // Both values are percent-encoded; without it any '&', '=', '+' or '%' in the
            // text would be interpreted as form syntax and truncate/corrupt the request.
            string payload = "&text=" + EncodeFormValue(texts) + "&langpair=" + EncodeFormValue(languages);
            string html = PostForm(@"http://translate.google.com/translate_t?hl=zh-CN&ie=utf8", payload);
            string translated = ExtractBetween(html, "<div id=result_box dir=\"ltr\">", "</div>");

            // "&amp;" is decoded last so that an escaped entity such as "&amp;lt;" becomes
            // the literal text "&lt;" instead of being decoded twice.
            return translated
                .Replace("&nbsp;", " ")
                .Replace("&quot;", "\"")
                .Replace("&gt;", ">")
                .Replace("&lt;", "<")
                .Replace("<br>", "\r\n")
                .Replace("&#160;", " ")
                .Replace("&#39;", "'")
                .Replace("&amp;", "&");
        }

        /// <summary>
        /// Translates <paramref name="texts"/> with Yahoo, retrying once after one second.
        /// </summary>
        /// <param name="texts">Text to translate.</param>
        /// <param name="languages">Language pair in Yahoo's form, e.g. "en_zh".</param>
        /// <returns>The translated text, or an empty string when both attempts fail.</returns>
        public string Yahoo(string texts, string languages)
        {
            return TranslateWithRetry(Yahoo_T, texts, languages);
        }

        /// <summary>
        /// Performs one Yahoo translation request and extracts the result from the page.
        /// </summary>
        /// <exception cref="WebException">The HTTP request failed.</exception>
        /// <exception cref="InvalidOperationException">The result markers were not found in the page.</exception>
        public string Yahoo_T(string texts, string languages)
        {
            string payload = "more=1&ei=UTF-8&trtext=" + EncodeFormValue(texts) + "&lp=" + EncodeFormValue(languages);
            string html = PostForm(@"http://fanyi.yahoo.com.cn/translate_txt?", payload);
            string translated = ExtractBetween(html, "<div id=\"pd\" class=\"pd\">", "</div>\n\t\t\t</div>");

            return translated
                .Replace("<dnt>", "")
                .Replace("</dnt>", "")
                .Replace("<br/>", "\r\n");
        }

        /// <summary>
        /// Runs <paramref name="translate"/>; on any failure waits one second and tries once more.
        /// </summary>
        /// <returns>The translator's result, or an empty string when the second attempt fails too.</returns>
        private static string TranslateWithRetry(Translator translate, string texts, string languages)
        {
            try
            {
                return translate(texts, languages);
            }
            catch (Exception)
            {
                // Deliberate best effort: network and scraping errors are all treated as transient.
                Thread.Sleep(1000);
            }

            try
            {
                return translate(texts, languages);
            }
            catch (Exception)
            {
                return "";
            }
        }

        /// <summary>
        /// Percent-encodes one form value. <c>null</c> is treated as an empty value.
        /// </summary>
        /// <remarks>
        /// On older .NET Framework versions <see cref="Uri.EscapeDataString"/> rejects very long
        /// strings with a <see cref="UriFormatException"/>.
        /// </remarks>
        internal static string EncodeFormValue(string value)
        {
            return string.IsNullOrEmpty(value) ? "" : Uri.EscapeDataString(value);
        }

        /// <summary>
        /// Returns the text located between the first <paramref name="startMarker"/> and the
        /// first <paramref name="endMarker"/> that follows it.
        /// </summary>
        /// <exception cref="InvalidOperationException">Either marker is missing.</exception>
        internal static string ExtractBetween(string content, string startMarker, string endMarker)
        {
            int markerAt = content.IndexOf(startMarker, StringComparison.Ordinal);
            if (markerAt < 0)
            {
                throw new InvalidOperationException("Start marker not found in response: " + startMarker);
            }

            int bodyStart = markerAt + startMarker.Length;
            int bodyEnd = content.IndexOf(endMarker, bodyStart, StringComparison.Ordinal);
            if (bodyEnd < 0)
            {
                throw new InvalidOperationException("End marker not found in response: " + endMarker);
            }

            return content.Substring(bodyStart, bodyEnd - bodyStart);
        }

        /// <summary>
        /// Posts <paramref name="payload"/> as a UTF-8 form body to <paramref name="url"/> and
        /// returns the response body decoded with the charset announced by the server.
        /// </summary>
        private static string PostForm(string url, string payload)
        {
            byte[] body = Encoding.UTF8.GetBytes(payload);

            WebRequest request = WebRequest.Create(url);
            request.Credentials = CredentialCache.DefaultNetworkCredentials;
            request.Method = "POST";
            request.ContentType = "application/x-www-form-urlencoded";
            request.Timeout = 50000;
            // Length of the encoded bytes, not of the string: they differ for non-ASCII text.
            request.ContentLength = body.Length;

            using (Stream requestStream = request.GetRequestStream())
            {
                requestStream.Write(body, 0, body.Length);
            }

            // 'using' releases the connection even when reading or decoding throws.
            using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
            {
                if (!string.Equals(response.StatusDescription, "ok", StringComparison.OrdinalIgnoreCase))
                {
                    Console.WriteLine("无法连接!");
                }

                using (Stream responseStream = response.GetResponseStream())
                using (StreamReader reader = new StreamReader(responseStream, ResolveEncoding(response)))
                {
                    return reader.ReadToEnd();
                }
            }
        }

        /// <summary>
        /// Maps the response's announced charset to an <see cref="Encoding"/>, falling back to
        /// UTF-8 when the charset is absent or unknown to this runtime.
        /// </summary>
        private static Encoding ResolveEncoding(HttpWebResponse response)
        {
            string charset = response.CharacterSet;
            if (string.IsNullOrEmpty(charset))
            {
                return Encoding.UTF8;
            }

            try
            {
                // Some servers quote the value: charset="utf-8".
                return Encoding.GetEncoding(charset.Trim(' ', '"', '\''));
            }
            catch (ArgumentException)
            {
                return Encoding.UTF8;
            }
            catch (NotSupportedException)
            {
                return Encoding.UTF8;
            }
        }
    }
}
加载翻译方向
   struct LanguageType
        
{
            
public string value;
            
public string text;
            
public LanguageType(string v, string t)
            
{
                value 
= v;
                text 
= t;
            }


            
public override string ToString()
            
{
                
return text;
            }

        }


  private void LoadLanguage(ComboBox comboBox)
        
{
            comboBox.Items.Add(
new LanguageType("ar|en""阿拉伯文到英语"));
            comboBox.Items.Add(
new LanguageType("ko|en""朝鲜语到英语"));
            comboBox.Items.Add(
new LanguageType("de|fr""德语到法语"));
            comboBox.Items.Add(
new LanguageType("de|en""德语到英语"));
            comboBox.Items.Add(
new LanguageType("ru|en""俄语到英语"));
            comboBox.Items.Add(
new LanguageType("fr|de""法语到德语"));
            comboBox.Items.Add(
new LanguageType("fr|en""法语到英语"));
            comboBox.Items.Add(
new LanguageType("nl|en""荷兰语到英语"));
            comboBox.Items.Add(
new LanguageType("pt|en""葡萄牙语到英语"));
            comboBox.Items.Add(
new LanguageType("ja|en""日语到英语"));
            comboBox.Items.Add(
new LanguageType("es|en""西班牙语到英语"));
            comboBox.Items.Add(
new LanguageType("el|en""希腊语到英语"));
            comboBox.Items.Add(
new LanguageType("it|en""意大利语到英语"));
            comboBox.Items.Add(
new LanguageType("en|ar""英语到阿拉伯文"));
            comboBox.Items.Add(
new LanguageType("en|ko""英语到朝鲜语"));
            comboBox.Items.Add(
new LanguageType("en|de""英语到德语"));
            comboBox.Items.Add(
new LanguageType("en|ru""英语到俄语"));
            comboBox.Items.Add(
new LanguageType("en|fr""英语到法语"));
            comboBox.Items.Add(
new LanguageType("en|nl""英语到荷兰语"));
            comboBox.Items.Add(
new LanguageType("en|pt""英语到葡萄牙语"));
            comboBox.Items.Add(
new LanguageType("en|ja""英语到日语"));
            comboBox.Items.Add(
new LanguageType("en|es""英语到西班牙语"));
            comboBox.Items.Add(
new LanguageType("en|el""英语到希腊语"));
            comboBox.Items.Add(
new LanguageType("en|it""英语到意大利语"));
            comboBox.Items.Add(
new LanguageType("en|zh-TW""英语到中文(繁体)"));
            comboBox.Items.Add(
new LanguageType("en|zh-CN""英语到中文(简体)"));
            comboBox.Items.Add(
new LanguageType("en|zh-CN""英语到中文"));
            comboBox.Items.Add(
new LanguageType("zh|en""中文到英语"));
            comboBox.Items.Add(
new LanguageType("zh-TW|zh-CN""中文(繁体到简体)"));
            comboBox.Items.Add(
new LanguageType("zh-CN|zh-TW""中文(简体到繁体)"));
        }


调用方法
  void GoogleT(TsMachine tm, string content, string languetype)
        
{
            
string tranlate = tm.Google(content, languetype);
            Google.Text 
= tranlate;
        }


        
/// <summary>
/// Translates <paramref name="content"/> through <c>tm.Yahoo</c> and puts the result
/// into the <c>Yahoo</c> control's <c>Text</c>.
/// </summary>
/// <param name="tm">Translator instance to call.</param>
/// <param name="content">Text to translate.</param>
/// <param name="languetype">Language pair in "source|target" form, e.g. "en|zh-CN".</param>
void YahooT(TsMachine tm, string content, string languetype)
{
    // Rewrite the pair for Yahoo: "_" replaces "|", "zh-TW" becomes "zt" and
    // "zh-CN" becomes "zh" (e.g. "en|zh-TW" -> "en_zt").
    string yahooPair = languetype
        .Replace("|", "_")
        .Replace("zh-TW", "zt")
        .Replace("zh-CN", "zh");

    Yahoo.Text = tm.Yahoo(content, yahooPair);
}



在这个请求翻译的过程中,最麻烦的是编码问题。Yahoo使用的是固定编码格式(UTF-8);Google就很不老实了,编码格式是变化的,每次调用 StreamReader reader = new StreamReader(dataStream, Encoding.GetEncoding("UTF-8")); 进行解码的时候老是出错,后来才发现每次返回的编码格式都在改变,如果用固定的UTF-8解码,得到的都是乱码。
根据多次测试,Google的编码格式和res.CharacterSet一致(不敢保证,但是测试了很多语种都是正确的)。 虽然摆平了Google和Yahoo,但是谷词一直没能搞定

这是我编写的谷词翻译代码

   public string Godict_T(string texts, string languages)
        
{
            
string payload = "from_content=" + texts + "&langpair=" + languages + "";
            WebRequest req 
= HttpWebRequest.Create(@"http://trans.godict.com/index.php");
            req.Credentials 
= CredentialCache.DefaultNetworkCredentials;
            req.Method 
= "POST";
            req.ContentType 
= "application/x-www-form-urlencoded";
            req.Timeout 
= 50000;
            req.ContentLength 
= payload.Length;

            Encoding encoding 
= Encoding.GetEncoding("UTF-8");
            Byte[] bytes 
= encoding.GetBytes(payload);
            req.ContentLength 
= bytes.Length;
            Stream newStream 
= null;

            newStream 
= req.GetRequestStream();

            newStream.Write(bytes, 
0, bytes.Length);
            newStream.Close();
            HttpWebResponse res 
= (HttpWebResponse)req.GetResponse();
            
if (res.StatusDescription.ToLower() != "ok")
            
{
                Console.WriteLine(
"无法连接!");
            }

            Stream dataStream 
= res.GetResponseStream();


            StreamReader reader 
= new StreamReader(dataStream, Encoding.GetEncoding(res.CharacterSet));

            
string responseFromServer = null;
            responseFromServer 
= reader.ReadToEnd();


            reader.Close();
            dataStream.Close();
            res.Close();

            
string content = responseFromServer;
            
string s = "<textarea name=q rows=12 style=\" 740px;\" wrap=PHYSICAL dir=ltr>";
            
int start = content.IndexOf(s);
            
int end = content.IndexOf("</textarea>", start);
            
return content.Substring(start + s.Length, end - start - s.Length);
        }

结果发现这个谷词的res.CharacterSet(响应流的字符集)永远是iso-8859-1,这就郁闷了。不知道这个地方如何解码,有知道的朋友请指教!

原文地址:https://www.cnblogs.com/moses/p/1148735.html