将字符串中的中文(英文)字符串转化为阿拉伯数字

 我们在做文本检查时经常需要查找其中的数字,但文本中的数字可能是以中文数字字符串(例如“两亿三千万”)的形式出现的,必须先转化为阿拉伯数字才能被程序识别。本文提供了一种将中文或者英文数字字符串转化为阿拉伯数字的方法。
之所以只以中文和英文为例,是因为我只懂这两种语言;从程序设计的角度来看,该方法适用于多种语种,只要将数据库配置好即可。

中文转化为阿拉伯数字: 可以通用于不需要分隔符区分单词的语种,例如中文、日文、韩文等(配置相应的数据库即可)
英文转化为阿拉伯数字: 可以通用于用空格分隔单词的语种,例如英文、法文、德文等(配置相应的数据库即可)

1、 设计语种对应表

t_SYS_Num

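| fid | ftext | fvalue | ftype |
| --- | --- | --- | --- |
| 1 | (缺失) | 0 | 1 |
| 2 | (缺失) | 1 | 1 |
| 3 | (缺失) | 2 | 1 |
| 4 | (缺失) | 3 | 1 |
| 5 | (缺失) | 4 | 1 |
| 6 | (缺失) | 5 | 1 |
| 7 | (缺失) | 6 | 1 |
| 8 | (缺失) | 7 | 1 |
| 9 | (缺失) | 8 | 1 |
| 10 | (缺失) | 9 | 1 |
| 11 | (缺失) | 1 | 1 |
| 12 | (缺失) | 2 | 1 |
| 13 | (缺失) | 3 | 1 |
| 14 | (缺失) | 4 | 1 |
| 15 | (缺失) | 5 | 1 |
| 16 | (缺失) | 6 | 1 |
| 17 | (缺失) | 7 | 1 |
| 18 | (缺失) | 8 | 1 |
| 19 | (缺失) | 9 | 1 |
| 22 | one | 1 | 0 |
| 23 | two | 2 | 0 |
| 24 | three | 3 | 0 |
| 25 | four | 4 | 0 |
| 26 | five | 5 | 0 |
| 27 | six | 6 | 0 |
| 28 | seven | 7 | 0 |
| 29 | eight | 8 | 0 |
| 30 | nine | 9 | 0 |
| 31 | ten | 10 | 0 |
| 32 | eleven | 11 | 0 |
| 33 | twelve | 12 | 0 |
| 34 | thirteen | 13 | 0 |
| 35 | fourteen | 14 | 0 |
| 36 | fifteen | 15 | 0 |
| 37 | sixteen | 16 | 0 |
| 38 | seventeen | 17 | 0 |
| 39 | eighteen | 18 | 0 |
| 40 | nineteen | 19 | 0 |
| 41 | twenty | 20 | 0 |
| 42 | twenty-one | 21 | 0 |
| 43 | twenty-two | 22 | 0 |
| 44 | twenty-three | 23 | 0 |
| 45 | twenty-four | 24 | 0 |
| 46 | twenty-five | 25 | 0 |
| 47 | twenty-six | 26 | 0 |
| 48 | twenty-seven | 27 | 0 |
| 49 | twenty-eight | 28 | 0 |
| 50 | twenty-nine | 29 | 0 |
| 51 | thirty | 30 | 0 |
| 52 | thirty-one | 31 | 0 |
| 53 | thirty-two | 32 | 0 |
| 54 | thirty-three | 33 | 0 |
| 55 | thirty-four | 34 | 0 |
| 56 | thirty-five | 35 | 0 |
| 57 | thirty-six | 36 | 0 |
| 58 | thirty-seven | 37 | 0 |
| 59 | thirty-eight | 38 | 0 |
| 60 | thirty-nine | 39 | 0 |
| 61 | forty | 40 | 0 |
| 62 | forty-one | 41 | 0 |
| 63 | forty-two | 42 | 0 |
| 64 | forty-three | 43 | 0 |
| 65 | forty-four | 44 | 0 |
| 66 | forty-five | 45 | 0 |
| 67 | forty-six | 46 | 0 |
| 68 | forty-seven | 47 | 0 |
| 69 | forty-eight | 48 | 0 |
| 70 | forty-nine | 49 | 0 |
| 71 | fifty | 50 | 0 |
| 72 | fifty-one | 51 | 0 |
| 73 | fifty-two | 52 | 0 |
| 74 | fifty-three | 53 | 0 |
| 75 | fifty-four | 54 | 0 |
| 76 | fifty-five | 55 | 0 |
| 77 | fifty-six | 56 | 0 |
| 78 | fifty-seven | 57 | 0 |
| 79 | fifty-eight | 58 | 0 |
| 80 | fifty-nine | 59 | 0 |
| 81 | sixty | 60 | 0 |
| 82 | sixty-one | 61 | 0 |
| 83 | sixty-two | 62 | 0 |
| 84 | sixty-three | 63 | 0 |
| 85 | sixty-four | 64 | 0 |
| 86 | sixty-five | 65 | 0 |
| 87 | sixty-six | 66 | 0 |
| 88 | sixty-seven | 67 | 0 |
| 89 | sixty-eight | 68 | 0 |
| 90 | sixty-nine | 69 | 0 |
| 91 | seventy | 70 | 0 |
| 92 | seventy-one | 71 | 0 |
| 93 | seventy-two | 72 | 0 |
| 94 | seventy-three | 73 | 0 |
| 95 | seventy-four | 74 | 0 |
| 96 | seventy-five | 75 | 0 |
| 97 | seventy-six | 76 | 0 |
| 98 | seventy-seven | 77 | 0 |
| 99 | seventy-eight | 78 | 0 |
| 100 | seventy-nine | 79 | 0 |
| 101 | eighty | 80 | 0 |
| 102 | eighty-one | 81 | 0 |
| 103 | eighty-two | 82 | 0 |
| 104 | eighty-three | 83 | 0 |
| 105 | eighty-four | 84 | 0 |
| 106 | eighty-five | 85 | 0 |
| 107 | eighty-six | 86 | 0 |
| 108 | eighty-seven | 87 | 0 |
| 109 | eighty-eight | 88 | 0 |
| 110 | eighty-nine | 89 | 0 |
| 111 | ninety | 90 | 0 |
| 112 | ninety-one | 91 | 0 |
| 113 | ninety-two | 92 | 0 |
| 114 | ninety-three | 93 | 0 |
| 115 | ninety-four | 94 | 0 |
| 116 | ninety-five | 95 | 0 |
| 117 | ninety-six | 96 | 0 |
| 118 | ninety-seven | 97 | 0 |
| 119 | ninety-eight | 98 | 0 |
| 120 | ninety-nine | 99 | 0 |
| 121 | (缺失) | 10 | 1 |
| 122 | (缺失) | 10 | 1 |

注:标为“(缺失)”的 ftext 单元格在本文的这份文本中已经丢失;这些行的 ftype 均为 1(中文),推测其内容是与 fvalue 对应的中文数字字符(例如小写与大写两套写法),请以实际数据库中的数据为准。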

fId:主键序号

fText:语种字符串

fValue:对应值

fType:语种分类

T_SYS_Unit

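| fid | funit | fvalue | flevel | fisspace | ftype |
| --- | --- | --- | --- | --- | --- |
| 1 | (缺失) | 10 | 1 | No | 1 |
| 2 | (缺失) | 100 | 2 | No | 1 |
| 3 | (缺失) | 1000 | 3 | No | 1 |
| 4 | (缺失) | 10000 | 4 | Yes | 1 |
| 5 | 亿 | 100000000 | 5 | Yes | 1 |
| 6 | (缺失) | 10 | 1 | No | 1 |
| 7 | (缺失) | 100 | 2 | No | 1 |
| 8 | (缺失) | 1000 | 3 | No | 1 |
| 9 | (缺失) | 10000 | 4 | Yes | 1 |
| 10 | (缺失) | 100000000 | 5 | Yes | 1 |
| 11 | hundred | 100 | 1 | No | 0 |
| 12 | thousand | 1000 | 2 | No | 0 |
| 13 | million | 1000000 | 3 | Yes | 0 |
| 14 | billion | 1000000000 | 4 | Yes | 0 |
| 15 | and | 1 | 0 | No | 0 |
| 16 | hundreds | 100 | 1 | No | 0 |
| 17 | thousands | 1000 | 2 | No | 0 |
| 18 | millions | 1000000 | 3 | Yes | 0 |
| 19 | billions | 1000000000 | 4 | Yes | 0 |

注:标为“(缺失)”的 funit 单元格在本文的这份文本中已经丢失;按 fvalue 推测,它们应是与 10、100、1000、10000、100000000 对应的中文单位字符(如“十”“百”“千”“万”及其另一套写法),请以实际数据库中的数据为准。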

fId:主键序号

fUnit:单位

fValue:对应的倍数

fLevel:单位排序级别

fIsspace:是否作为分隔符

fType:语种分类

2、 搜索符合原文中的数字字符串

a)         在文本中查找由表 t_SYS_Num 的 ftext 和表 t_SYS_Unit 的 funit 中的词连续组成的最长字符串

3、 用递归法分割数字字符串

a)         首先按照 fLevel 最大、且 fIsspace 为 True 的单位(funit)进行分割

                                       i.              例如中文: 以“万”、“亿” 进行分割

                                     ii.              “三千二百万零二十一亿零五千” ——先以“亿”为界被分离为 “三千二百万零二十一” 和“零五千”

                                    iii.              然后按照下一级的单位继续分离,直到所有 fisspace 为 True 的单位都分离完毕

4、 将万以内的数字字符串转化为数字

5、 将转化得到的数字乘以此前分离出来的单位所对应的倍数(fValue)

6、 将所有的分离开的数字加起来得到最后的数值



代码实现:

 

 /// <summary>
        
/// 数字字符串信息
        
/// </summary>

        public struct NumString
        
{
            
public int Paragraph;
            
public int Start;
            
public int Length;
            
public string numstring;
            
public long Num;

        }


        
/// <summary>
        /// Languages the number parser can handle. The numeric value of each member
        /// is what the SQL queries in this class compare against the <c>ftype</c>
        /// column of <c>t_SYS_Num</c> and <c>t_SYS_Unit</c>.
        /// </summary>
        public enum LanguageType
        {
            /// <summary>English; words are separated by spaces (ftype 0).</summary>
            English = 0,

            /// <summary>Chinese; text is scanned character by character (ftype 1).</summary>
            Chinese = 1,
        }

 

 

       private ArrayList GetNumString(string[] content, LanguageType type)
        
{
            
switch (type)
            
{
                
case LanguageType.Chinese:
                    
return GetNumString_CH(content);
                
case LanguageType.English:
                    
return GetNumString_EN(content);
                
default:
                    
return GetNumString_EN(content);

            }


        }


        
        
/// <summary>
        /// Finds English number phrases in an array of text paragraphs.
        /// A phrase starts at a word listed in <c>t_SYS_Num</c> (ftype 0) and is
        /// extended over every following word that is listed in either
        /// <c>t_SYS_Num</c> or <c>t_SYS_Unit</c> (ftype 0). Words are obtained by
        /// splitting each paragraph on single spaces.
        /// </summary>
        /// <param name="content">Paragraphs to scan; null or empty entries are skipped.</param>
        /// <returns>
        /// A list of <see cref="NumString"/> values describing each phrase (paragraph
        /// index, character offset, length, text and parsed value). The list is empty
        /// when <paramref name="content"/> is null.
        /// </returns>
        private ArrayList GetNumString_EN(string[] content)
        {
            ArrayList found = new ArrayList();
            if (content == null)
            {
                return found;
            }

            DataView numberWords = DCBase.ExecuteQuery("select * from t_SYS_Num where ftype =0").Tables[0].DefaultView;
            DataView unitWords = DCBase.ExecuteQuery("select * from t_SYS_Unit where ftype =0").Tables[0].DefaultView;

            for (int i = 0; i < content.Length; i++)
            {
                if (string.IsNullOrEmpty(content[i]))
                {
                    continue;
                }

                string[] words = content[i].Split(' ');
                int start = 0; // character offset of words[j] inside content[i]
                int j = 0;
                while (j < words.Length)
                {
                    string word = words[j];

                    // A single quote inside the word would terminate the filter's string
                    // literal and make RowFilter throw, so it is doubled before use.
                    numberWords.RowFilter = "ftext='" + word.Replace("'", "''") + "'";

                    // A phrase can only begin with a (non-empty) number word.
                    if (word.Length == 0 || numberWords.Count == 0)
                    {
                        start += word.Length + 1; // the word plus the space that followed it
                        j++;
                        continue;
                    }

                    // Extend the phrase over the following number and unit words.
                    int length = word.Length;
                    int extraWords = 0;
                    for (int n = j + 1; n < words.Length; n++)
                    {
                        string escaped = words[n].Replace("'", "''");
                        numberWords.RowFilter = "ftext='" + escaped + "'";
                        unitWords.RowFilter = "funit='" + escaped + "'";
                        if (numberWords.Count == 0 && unitWords.Count == 0)
                        {
                            break;
                        }

                        length += words[n].Length + 1; // +1 for the separating space
                        extraWords++;
                    }

                    NumString ns = new NumString();
                    ns.Paragraph = i;
                    ns.Start = start;
                    ns.Length = length;
                    ns.numstring = content[i].Substring(start, length);
                    ns.Num = Splite(ns.numstring, LanguageType.English);
                    found.Add(ns);

                    // Continue with the word that ended the phrase.
                    j += extraWords + 1;
                    start += length + 1;
                }
            }

            return found;
        }


        
/// <summary>
        /// Finds Chinese number phrases in an array of text paragraphs.
        /// The text is scanned one character at a time: a phrase starts at a character
        /// listed in <c>t_SYS_Num</c> (ftype 1) and is extended over every following
        /// character that is listed in either <c>t_SYS_Num</c> or <c>t_SYS_Unit</c>
        /// (ftype 1). Because single characters are compared, only single-character
        /// table entries can match.
        /// </summary>
        /// <param name="content">Paragraphs to scan; null or empty entries are skipped.</param>
        /// <returns>
        /// A list of <see cref="NumString"/> values describing each phrase (paragraph
        /// index, character offset, length, text and parsed value). The list is empty
        /// when <paramref name="content"/> is null.
        /// </returns>
        private ArrayList GetNumString_CH(string[] content)
        {
            ArrayList found = new ArrayList();
            if (content == null)
            {
                return found;
            }

            DataView digitChars = DCBase.ExecuteQuery("select * from t_SYS_Num where ftype =1").Tables[0].DefaultView;
            DataView unitChars = DCBase.ExecuteQuery("select * from t_SYS_Unit where ftype =1").Tables[0].DefaultView;

            for (int i = 0; i < content.Length; i++)
            {
                string text = content[i];
                if (string.IsNullOrEmpty(text))
                {
                    continue;
                }

                int j = 0;
                while (j < text.Length)
                {
                    // A single quote would terminate the filter's string literal and make
                    // RowFilter throw, so it is doubled before use.
                    digitChars.RowFilter = "ftext='" + text[j].ToString().Replace("'", "''") + "'";
                    if (digitChars.Count == 0)
                    {
                        j++;
                        continue;
                    }

                    // Extend the phrase over the following digit and unit characters.
                    int length = 1;
                    while (j + length < text.Length)
                    {
                        string escaped = text[j + length].ToString().Replace("'", "''");
                        digitChars.RowFilter = "ftext='" + escaped + "'";
                        unitChars.RowFilter = "funit='" + escaped + "'";
                        if (digitChars.Count == 0 && unitChars.Count == 0)
                        {
                            break;
                        }

                        length++;
                    }

                    NumString ns = new NumString();
                    ns.Paragraph = i;
                    ns.Start = j;
                    ns.Length = length;
                    ns.numstring = text.Substring(j, length);
                    ns.Num = Splite(ns.numstring, LanguageType.Chinese);
                    found.Add(ns);

                    // Resume at the character that ended the phrase; it is not a digit,
                    // so the next iteration simply steps over it.
                    j += length;
                }
            }

            return found;
        }

 

     /// <summary>
        
/// 将中文字符串数组转化为阿拉伯数组
        
/// </summary>
        
/// <param name="number_CH"></param>
        
/// <returns></returns>

        private long[] CH2A(ArrayList number_CH)
        
{
            
long[] num = new long[number_CH.Count];
            
for (int i = 0; i < number_CH.Count; i++)
            
{
                NumString ns 
= (NumString)number_CH[i];
                
string numberstring = ns.numstring;
                num[i] 
= Splite(numberstring, LanguageType.Chinese);
            }


            
return num;
        }


 
/// <summary>
        /// Converts a Chinese or English number phrase to its numeric value.
        /// Looks up the highest level among the separator units
        /// (<c>fisspace=true</c>) of the language and starts the recursive split there.
        /// </summary>
        /// <param name="numstring">The number phrase to convert.</param>
        /// <param name="type">Language of the phrase; its numeric value selects the <c>ftype</c> rows.</param>
        /// <returns>The numeric value of the phrase.</returns>
        private long Splite(string numstring, LanguageType type)
        {
            string sql = "select max(flevel) from t_SYS_Unit where  fisspace=true and ftype=" + (int)type;
            object top = DCBase.ExecuteScalar(sql);

            // Level 0 is used when the language has no separator units.
            // Convert.ToInt32 is used instead of an (int) cast because unboxing only
            // succeeds when the provider returns exactly Int32, and max() may come back
            // as another numeric type.
            int maxlevel = 0;
            if (top != null && top != DBNull.Value)
            {
                maxlevel = Convert.ToInt32(top);
            }

            return Splite(numstring, type, maxlevel);
        }


        
/// <summary>
        /// Recursively converts a Chinese or English number phrase to its numeric value.
        /// The phrase is cut at the first occurrence of a separator unit
        /// (<c>fisspace=true</c>) of the highest level not above
        /// <paramref name="maxLevel"/>; the result is
        /// <c>value(left) * unit value + value(right)</c>, with both sides processed at
        /// the next lower level. When no separator level is left, the phrase is handed
        /// to <c>Tran2A</c>.
        /// </summary>
        /// <param name="numstring">The number phrase (or part of one) to convert.</param>
        /// <param name="type">Language of the phrase; its numeric value selects the <c>ftype</c> rows.</param>
        /// <param name="maxLevel">Highest separator level (<c>flevel</c>) that may be used for splitting.</param>
        /// <returns>The numeric value of the phrase.</returns>
        private long Splite(string numstring, LanguageType type, int maxLevel)
        {
            string sql = "select flevel from t_SYS_Unit where  fisspace=true and ftype =" + (int)type + " and flevel<=" + maxLevel + " order by flevel desc";
            object top = DCBase.ExecuteScalar(sql);
            if (top == null || top == DBNull.Value)
            {
                // No separator unit at or below maxLevel.
                // NOTE(review): Tran2A is not visible in this part of the file; presumably
                // it dispatches to EN2A / CH2A by language - confirm.
                return Tran2A(numstring, type);
            }

            int level = Convert.ToInt32(top);
            sql = "select * from t_SYS_Unit where  fisspace=true and ftype =" + (int)type + " and flevel=" + level + " order by flevel desc";
            DataTable units = DCBase.ExecuteQuery(sql).Tables[0];
            numstring = numstring.Trim();
            if (units.Rows.Count == 0)
            {
                return Tran2A(numstring, type);
            }

            // Units are matched against the lower-cased phrase, and the parts passed on
            // are lower-cased as well.
            string lowered = numstring.ToLowerInvariant();

            // Pick the unit that occurs first; if several start at the same position,
            // take the longest so that "million" does not match inside "millions".
            DataRow matched = null;
            int matchIndex = -1;
            int matchLength = 0;
            foreach (DataRow row in units.Rows)
            {
                string unit = row["funit"].ToString();
                if (unit.Length == 0)
                {
                    continue; // an empty unit would match everywhere
                }

                int at = lowered.IndexOf(unit, StringComparison.Ordinal);
                if (at < 0)
                {
                    continue;
                }

                if (matched == null || at < matchIndex || (at == matchIndex && unit.Length > matchLength))
                {
                    matched = row;
                    matchIndex = at;
                    matchLength = unit.Length;
                }
            }

            if (matched == null)
            {
                // This level's unit does not occur; try the next lower level.
                return Splite(numstring, type, level - 1);
            }

            string left = lowered.Substring(0, matchIndex);
            string right = lowered.Substring(matchIndex + matchLength);
            long multiplier = Convert.ToInt64(matched["fvalue"]);

            return Splite(left, type, level - 1) * multiplier + Splite(right, type, level - 1);
        }

 


 

/// <summary>
        /// Converts an English number phrase that contains no separator units (numbers
        /// below ten thousand, according to the original comment) to its value.
        /// For every non-separator unit (<c>fisspace=false</c>, ftype 0) found in the
        /// phrase, the number word directly before it is multiplied by the unit's value;
        /// the number words after the right-most unit are then added.
        /// Word comparisons ignore case.
        /// </summary>
        /// <param name="numstring">The phrase, words separated by single spaces.</param>
        /// <returns>The numeric value; 0 for a null or empty phrase.</returns>
        private int EN2A(string numstring)
        {
            if (string.IsNullOrEmpty(numstring))
            {
                return 0;
            }

            DataTable numberTable = DCBase.ExecuteQuery("select * from t_SYS_Num where ftype =0").Tables[0];
            DataTable unitTable = DCBase.ExecuteQuery("select * from t_SYS_Unit where  fisspace=false and ftype =0 order by flevel desc").Tables[0];

            string[] words = numstring.Split(' ');
            int result = 0;
            int lastUnitIndex = -1; // position of the right-most unit word found so far

            foreach (DataRow unitRow in unitTable.Rows)
            {
                string unit = unitRow["funit"].ToString();

                // Only the first occurrence of each unit is considered.
                int index = -1;
                for (int i = 0; i < words.Length; i++)
                {
                    if (string.Equals(words[i], unit, StringComparison.OrdinalIgnoreCase))
                    {
                        index = i;
                        break;
                    }
                }

                if (index < 0)
                {
                    continue;
                }

                // Keep the maximum rather than the last assignment, so the trailing words
                // really start after the right-most unit. A unit at position 0 (e.g. a
                // part such as "and five") counts too, otherwise the words after it
                // would be lost.
                lastUnitIndex = Math.Max(lastUnitIndex, index);

                if (index > 0)
                {
                    int multiplier;
                    if (TryGetEnglishNumberValue(numberTable, words[index - 1], out multiplier))
                    {
                        result += multiplier * Convert.ToInt32(unitRow["fvalue"]);
                    }
                }
            }

            // Add every number word after the right-most unit, so that an unhyphenated
            // compound such as "twenty one" yields 21 rather than 20.
            for (int i = lastUnitIndex + 1; i < words.Length; i++)
            {
                int value;
                if (TryGetEnglishNumberValue(numberTable, words[i], out value))
                {
                    result += value;
                }
            }

            return result;
        }

        /// <summary>
        /// Looks up the value of an English number word in the number table, ignoring case.
        /// </summary>
        private static bool TryGetEnglishNumberValue(DataTable numbers, string word, out int value)
        {
            foreach (DataRow row in numbers.Rows)
            {
                if (string.Equals(word, row["ftext"].ToString(), StringComparison.OrdinalIgnoreCase))
                {
                    value = Convert.ToInt32(row["fvalue"]);
                    return true;
                }
            }

            value = 0;
            return false;
        }

 

 /// <summary>
        
/// 万以内的数字(中文)
        
/// </summary>
        
/// <param name="numstring"></param>
        
/// <returns></returns>

        private long CH2A(string numstring)
        
{
            
string sql = "select * from t_SYS_Num where ftype =1";
            DataSet ds 
= DCBase.ExecuteQuery(sql);
            sql 
= "select * from t_SYS_Unit where  fisspace=false and ftype =1 order by flevel desc";
            DataSet ds1 
= DCBase.ExecuteQuery(sql);
            DataTable dt1 
= ds.Tables[0];
            DataTable dt2 
= ds1.Tables[0];
            
long result = 0;
            
int index = -1;
            
long value = 0;
            
int maxindex = -1;//查找的最优一个单位出现的位置
            foreach (DataRow dr in dt2.Rows)
            
{
                value 
= 0;
                index 
= numstring.IndexOf(dr["funit"].ToString());
                
//如果数字是以单位开头,就在该数字前加“一”,例如“十四” 变成“一十四”;
                if (index == 0)
                
{
                    numstring 
= "" + numstring;
                    index 
= numstring.IndexOf(dr["funit"].ToString());
                }

                
if (index > 0)
                
{
                    maxindex 
= index;
                    
foreach (DataRow row in dt1.Rows)
                    
{
                        value 
= 0;
                        
if (numstring[index - 1].ToString() == row["ftext"].ToString())
                        
{
                            value 
= (int)row["fvalue"];
                            
break;
                        }

                    }

                }

                
else
                    
continue;
                result 
+= value * (int)dr["fvalue"];
            }

            
//个位结尾的情况
            value = 0;
            
//处理个位数
            if (maxindex != numstring.Length - 1)
            
{
                
foreach (DataRow row in dt1.Rows)
                
{
                    
if (numstring[maxindex + 1].ToString() == row["ftext"].ToString())
                    
{
                        value 
= (int)row["fvalue"];
                        
break;
                    }

                }

            }

            
return result + value;

        }


原文地址:https://www.cnblogs.com/moses/p/1060794.html