JAVA提取字符串中所有的URL链接,并加上a标签

工具类

Patterns.java

  1 package com.util;
  2 
  3 import java.util.regex.Matcher;
  4 import java.util.regex.Pattern;
  5 
  6 /**
  7  * Commonly used regular expression patterns.
  8  */
  9 public class Patterns {
 10     /**
 11      *  Regular expression to match all IANA top-level domains.
 12      *  List accurate as of 2011/07/18.  List taken from:
 13      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 14      *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
 15      *
 16      *  @deprecated Due to the recent profileration of gTLDs, this API is
 17      *  expected to become out-of-date very quickly. Therefore it is now
 18      *  deprecated.
 19      */
 20     @Deprecated
 21     public static final String TOP_LEVEL_DOMAIN_STR =
 22             "((aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
 23                     + "|(biz|b[abdefghijmnorstvwyz])"
 24                     + "|(cat|com|coop|c[acdfghiklmnoruvxyz])"
 25                     + "|d[ejkmoz]"
 26                     + "|(edu|e[cegrstu])"
 27                     + "|f[ijkmor]"
 28                     + "|(gov|g[abdefghilmnpqrstuwy])"
 29                     + "|h[kmnrtu]"
 30                     + "|(info|int|i[delmnoqrst])"
 31                     + "|(jobs|j[emop])"
 32                     + "|k[eghimnprwyz]"
 33                     + "|l[abcikrstuvy]"
 34                     + "|(mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
 35                     + "|(name|net|n[acefgilopruz])"
 36                     + "|(org|om)"
 37                     + "|(pro|p[aefghklmnrstwy])"
 38                     + "|qa"
 39                     + "|r[eosuw]"
 40                     + "|s[abcdeghijklmnortuvyz]"
 41                     + "|(tel|travel|t[cdfghjklmnoprtvwz])"
 42                     + "|u[agksyz]"
 43                     + "|v[aceginu]"
 44                     + "|w[fs]"
 45                     + "|(u03b4u03bfu03bau03b9u03bcu03ae|u0438u0441u043fu044bu0442u0430u043du0438u0435|u0440u0444|u0441u0440u0431|u05d8u05e2u05e1u05d8|u0622u0632u0645u0627u06ccu0634u06cc|u0625u062eu062au0628u0627u0631|u0627u0644u0627u0631u062fu0646|u0627u0644u062cu0632u0627u0626u0631|u0627u0644u0633u0639u0648u062fu064au0629|u0627u0644u0645u063au0631u0628|u0627u0645u0627u0631u0627u062a|u0628u06beu0627u0631u062a|u062au0648u0646u0633|u0633u0648u0631u064au0629|u0641u0644u0633u0637u064au0646|u0642u0637u0631|u0645u0635u0631|u092au0930u0940u0915u094du0937u093e|u092du093eu0930u0924|u09adu09beu09b0u09a4|u0a2du0a3eu0a30u0a24|u0aadu0abeu0ab0u0aa4|u0b87u0ba8u0bcdu0ba4u0bbfu0bafu0bbe|u0b87u0bb2u0b99u0bcdu0b95u0bc8|u0b9au0bbfu0b99u0bcdu0b95u0baau0bcdu0baau0bc2u0bb0u0bcd|u0baau0bb0u0bbfu0b9fu0bcdu0b9au0bc8|u0c2du0c3eu0c30u0c24u0c4d|u0dbdu0d82u0d9au0dcf|u0e44u0e17u0e22|u30c6u30b9u30c8|u4e2du56fd|u4e2du570b|u53f0u6e7e|u53f0u7063|u65b0u52a0u5761|u6d4bu8bd5|u6e2cu8a66|u9999u6e2f|ud14cuc2a4ud2b8|ud55cuad6d|xn\-\-0zwm56d|xn\-\-11b5bs3a9aj6g|xn\-\-3e0b707e|xn\-\-45brj9c|xn\-\-80akhbyknj4f|xn\-\-90a3ac|xn\-\-9t4b11yi5a|xn\-\-clchc0ea0b2g2a9gcd|xn\-\-deba0ad|xn\-\-fiqs8s|xn\-\-fiqz9s|xn\-\-fpcrj9c3d|xn\-\-fzc2c9e2c|xn\-\-g6w251d|xn\-\-gecrj9c|xn\-\-h2brj9c|xn\-\-hgbk6aj7f53bba|xn\-\-hlcj6aya9esc7a|xn\-\-j6w193g|xn\-\-jxalpdlp|xn\-\-kgbechtv|xn\-\-kprw13d|xn\-\-kpry57d|xn\-\-lgbbat1ad8j|xn\-\-mgbaam7a8h|xn\-\-mgbayh7gpa|xn\-\-mgbbh1a71e|xn\-\-mgbc0a9azcg|xn\-\-mgberp4a5d4ar|xn\-\-o3cw4h|xn\-\-ogbpf8fl|xn\-\-p1ai|xn\-\-pgbs0dh|xn\-\-s9brj9c|xn\-\-wgbh1c|xn\-\-wgbl6a|xn\-\-xkc2al3hye2a|xn\-\-xkc2dl3a5ee0h|xn\-\-yfro4i67o|xn\-\-ygbi2ammx|xn\-\-zckzah|xxx)"
 46                     + "|y[et]"
 47                     + "|z[amw])";
 48 
 49     /**
 50      *  Regular expression pattern to match all IANA top-level domains.
 51      *  @deprecated This API is deprecated. See {@link #TOP_LEVEL_DOMAIN_STR}.
 52      */
 53     @Deprecated
 54     public static final Pattern TOP_LEVEL_DOMAIN =
 55             Pattern.compile(TOP_LEVEL_DOMAIN_STR);
 56 
 57     /**
 58      *  Regular expression to match all IANA top-level domains for WEB_URL.
 59      *  List accurate as of 2011/07/18.  List taken from:
 60      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
 61      *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
 62      *
 63      *  @deprecated This API is deprecated. See {@link #TOP_LEVEL_DOMAIN_STR}.
 64      */
 65     @Deprecated
 66     public static final String TOP_LEVEL_DOMAIN_STR_FOR_WEB_URL =
 67             "(?:"
 68                     + "(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])"
 69                     + "|(?:biz|b[abdefghijmnorstvwyz])"
 70                     + "|(?:cat|com|coop|c[acdfghiklmnoruvxyz])"
 71                     + "|d[ejkmoz]"
 72                     + "|(?:edu|e[cegrstu])"
 73                     + "|f[ijkmor]"
 74                     + "|(?:gov|g[abdefghilmnpqrstuwy])"
 75                     + "|h[kmnrtu]"
 76                     + "|(?:info|int|i[delmnoqrst])"
 77                     + "|(?:jobs|j[emop])"
 78                     + "|k[eghimnprwyz]"
 79                     + "|l[abcikrstuvy]"
 80                     + "|(?:mil|mobi|museum|m[acdeghklmnopqrstuvwxyz])"
 81                     + "|(?:name|net|n[acefgilopruz])"
 82                     + "|(?:org|om)"
 83                     + "|(?:pro|p[aefghklmnrstwy])"
 84                     + "|qa"
 85                     + "|r[eosuw]"
 86                     + "|s[abcdeghijklmnortuvyz]"
 87                     + "|(?:tel|travel|t[cdfghjklmnoprtvwz])"
 88                     + "|u[agksyz]"
 89                     + "|v[aceginu]"
 90                     + "|w[fs]"
 91                     + "|(?:u03b4u03bfu03bau03b9u03bcu03ae|u0438u0441u043fu044bu0442u0430u043du0438u0435|u0440u0444|u0441u0440u0431|u05d8u05e2u05e1u05d8|u0622u0632u0645u0627u06ccu0634u06cc|u0625u062eu062au0628u0627u0631|u0627u0644u0627u0631u062fu0646|u0627u0644u062cu0632u0627u0626u0631|u0627u0644u0633u0639u0648u062fu064au0629|u0627u0644u0645u063au0631u0628|u0627u0645u0627u0631u0627u062a|u0628u06beu0627u0631u062a|u062au0648u0646u0633|u0633u0648u0631u064au0629|u0641u0644u0633u0637u064au0646|u0642u0637u0631|u0645u0635u0631|u092au0930u0940u0915u094du0937u093e|u092du093eu0930u0924|u09adu09beu09b0u09a4|u0a2du0a3eu0a30u0a24|u0aadu0abeu0ab0u0aa4|u0b87u0ba8u0bcdu0ba4u0bbfu0bafu0bbe|u0b87u0bb2u0b99u0bcdu0b95u0bc8|u0b9au0bbfu0b99u0bcdu0b95u0baau0bcdu0baau0bc2u0bb0u0bcd|u0baau0bb0u0bbfu0b9fu0bcdu0b9au0bc8|u0c2du0c3eu0c30u0c24u0c4d|u0dbdu0d82u0d9au0dcf|u0e44u0e17u0e22|u30c6u30b9u30c8|u4e2du56fd|u4e2du570b|u53f0u6e7e|u53f0u7063|u65b0u52a0u5761|u6d4bu8bd5|u6e2cu8a66|u9999u6e2f|ud14cuc2a4ud2b8|ud55cuad6d|xn\-\-0zwm56d|xn\-\-11b5bs3a9aj6g|xn\-\-3e0b707e|xn\-\-45brj9c|xn\-\-80akhbyknj4f|xn\-\-90a3ac|xn\-\-9t4b11yi5a|xn\-\-clchc0ea0b2g2a9gcd|xn\-\-deba0ad|xn\-\-fiqs8s|xn\-\-fiqz9s|xn\-\-fpcrj9c3d|xn\-\-fzc2c9e2c|xn\-\-g6w251d|xn\-\-gecrj9c|xn\-\-h2brj9c|xn\-\-hgbk6aj7f53bba|xn\-\-hlcj6aya9esc7a|xn\-\-j6w193g|xn\-\-jxalpdlp|xn\-\-kgbechtv|xn\-\-kprw13d|xn\-\-kpry57d|xn\-\-lgbbat1ad8j|xn\-\-mgbaam7a8h|xn\-\-mgbayh7gpa|xn\-\-mgbbh1a71e|xn\-\-mgbc0a9azcg|xn\-\-mgberp4a5d4ar|xn\-\-o3cw4h|xn\-\-ogbpf8fl|xn\-\-p1ai|xn\-\-pgbs0dh|xn\-\-s9brj9c|xn\-\-wgbh1c|xn\-\-wgbl6a|xn\-\-xkc2al3hye2a|xn\-\-xkc2dl3a5ee0h|xn\-\-yfro4i67o|xn\-\-ygbi2ammx|xn\-\-zckzah|xxx)"
 92                     + "|y[et]"
 93                     + "|z[amw]))";
 94 
 95     /**
 96      *  Regular expression to match all IANA top-level domains.
 97      *
 98      *  List accurate as of 2015/11/24.  List taken from:
 99      *  http://data.iana.org/TLD/tlds-alpha-by-domain.txt
100      *  This pattern is auto-generated by frameworks/ex/common/tools/make-iana-tld-pattern.py
101      *
102      *  @hide
103      */
104     static final String IANA_TOP_LEVEL_DOMAINS =
105             "(?:"
106                     + "(?:aaa|aarp|abb|abbott|abogado|academy|accenture|accountant|accountants|aco|active"
107                     + "|actor|ads|adult|aeg|aero|afl|agency|aig|airforce|airtel|allfinanz|alsace|amica|amsterdam"
108                     + "|android|apartments|app|apple|aquarelle|aramco|archi|army|arpa|arte|asia|associates"
109                     + "|attorney|auction|audio|auto|autos|axa|azure|a[cdefgilmoqrstuwxz])"
110                     + "|(?:band|bank|bar|barcelona|barclaycard|barclays|bargains|bauhaus|bayern|bbc|bbva"
111                     + "|bcn|beats|beer|bentley|berlin|best|bet|bharti|bible|bid|bike|bing|bingo|bio|biz|black"
112                     + "|blackfriday|bloomberg|blue|bms|bmw|bnl|bnpparibas|boats|bom|bond|boo|boots|boutique"
113                     + "|bradesco|bridgestone|broadway|broker|brother|brussels|budapest|build|builders|business"
114                     + "|buzz|bzh|b[abdefghijmnorstvwyz])"
115                     + "|(?:cab|cafe|cal|camera|camp|cancerresearch|canon|capetown|capital|car|caravan|cards"
116                     + "|care|career|careers|cars|cartier|casa|cash|casino|cat|catering|cba|cbn|ceb|center|ceo"
117                     + "|cern|cfa|cfd|chanel|channel|chat|cheap|chloe|christmas|chrome|church|cipriani|cisco"
118                     + "|citic|city|cityeats|claims|cleaning|click|clinic|clothing|cloud|club|clubmed|coach"
119                     + "|codes|coffee|college|cologne|com|commbank|community|company|computer|comsec|condos"
120                     + "|construction|consulting|contractors|cooking|cool|coop|corsica|country|coupons|courses"
121                     + "|credit|creditcard|creditunion|cricket|crown|crs|cruises|csc|cuisinella|cymru|cyou|c[acdfghiklmnoruvwxyz])"
122                     + "|(?:dabur|dad|dance|date|dating|datsun|day|dclk|deals|degree|delivery|dell|delta"
123                     + "|democrat|dental|dentist|desi|design|dev|diamonds|diet|digital|direct|directory|discount"
124                     + "|dnp|docs|dog|doha|domains|doosan|download|drive|durban|dvag|d[ejkmoz])"
125                     + "|(?:earth|eat|edu|education|email|emerck|energy|engineer|engineering|enterprises"
126                     + "|epson|equipment|erni|esq|estate|eurovision|eus|events|everbank|exchange|expert|exposed"
127                     + "|express|e[cegrstu])"
128                     + "|(?:fage|fail|fairwinds|faith|family|fan|fans|farm|fashion|feedback|ferrero|film"
129                     + "|final|finance|financial|firmdale|fish|fishing|fit|fitness|flights|florist|flowers|flsmidth"
130                     + "|fly|foo|football|forex|forsale|forum|foundation|frl|frogans|fund|furniture|futbol|fyi"
131                     + "|f[ijkmor])"
132                     + "|(?:gal|gallery|game|garden|gbiz|gdn|gea|gent|genting|ggee|gift|gifts|gives|giving"
133                     + "|glass|gle|global|globo|gmail|gmo|gmx|gold|goldpoint|golf|goo|goog|google|gop|gov|grainger"
134                     + "|graphics|gratis|green|gripe|group|gucci|guge|guide|guitars|guru|g[abdefghilmnpqrstuwy])"
135                     + "|(?:hamburg|hangout|haus|healthcare|help|here|hermes|hiphop|hitachi|hiv|hockey|holdings"
136                     + "|holiday|homedepot|homes|honda|horse|host|hosting|hoteles|hotmail|house|how|hsbc|hyundai"
137                     + "|h[kmnrtu])"
138                     + "|(?:ibm|icbc|ice|icu|ifm|iinet|immo|immobilien|industries|infiniti|info|ing|ink|institute"
139                     + "|insure|int|international|investments|ipiranga|irish|ist|istanbul|itau|iwc|i[delmnoqrst])"
140                     + "|(?:jaguar|java|jcb|jetzt|jewelry|jlc|jll|jobs|joburg|jprs|juegos|j[emop])"
141                     + "|(?:kaufen|kddi|kia|kim|kinder|kitchen|kiwi|koeln|komatsu|krd|kred|kyoto|k[eghimnprwyz])"
142                     + "|(?:lacaixa|lancaster|land|landrover|lasalle|lat|latrobe|law|lawyer|lds|lease|leclerc"
143                     + "|legal|lexus|lgbt|liaison|lidl|life|lifestyle|lighting|limited|limo|linde|link|live"
144                     + "|lixil|loan|loans|lol|london|lotte|lotto|love|ltd|ltda|lupin|luxe|luxury|l[abcikrstuvy])"
145                     + "|(?:madrid|maif|maison|man|management|mango|market|marketing|markets|marriott|mba"
146                     + "|media|meet|melbourne|meme|memorial|men|menu|meo|miami|microsoft|mil|mini|mma|mobi|moda"
147                     + "|moe|moi|mom|monash|money|montblanc|mormon|mortgage|moscow|motorcycles|mov|movie|movistar"
148                     + "|mtn|mtpc|mtr|museum|mutuelle|m[acdeghklmnopqrstuvwxyz])"
149                     + "|(?:nadex|nagoya|name|navy|nec|net|netbank|network|neustar|new|news|nexus|ngo|nhk"
150                     + "|nico|ninja|nissan|nokia|nra|nrw|ntt|nyc|n[acefgilopruz])"
151                     + "|(?:obi|office|okinawa|omega|one|ong|onl|online|ooo|oracle|orange|org|organic|osaka"
152                     + "|otsuka|ovh|om)"
153                     + "|(?:page|panerai|paris|partners|parts|party|pet|pharmacy|philips|photo|photography"
154                     + "|photos|physio|piaget|pics|pictet|pictures|ping|pink|pizza|place|play|playstation|plumbing"
155                     + "|plus|pohl|poker|porn|post|praxi|press|pro|prod|productions|prof|properties|property"
156                     + "|protection|pub|p[aefghklmnrstwy])"
157                     + "|(?:qpon|quebec|qa)"
158                     + "|(?:racing|realtor|realty|recipes|red|redstone|rehab|reise|reisen|reit|ren|rent|rentals"
159                     + "|repair|report|republican|rest|restaurant|review|reviews|rich|ricoh|rio|rip|rocher|rocks"
160                     + "|rodeo|rsvp|ruhr|run|rwe|ryukyu|r[eosuw])"
161                     + "|(?:saarland|sakura|sale|samsung|sandvik|sandvikcoromant|sanofi|sap|sapo|sarl|saxo"
162                     + "|sbs|sca|scb|schmidt|scholarships|school|schule|schwarz|science|scor|scot|seat|security"
163                     + "|seek|sener|services|seven|sew|sex|sexy|shiksha|shoes|show|shriram|singles|site|ski"
164                     + "|sky|skype|sncf|soccer|social|software|sohu|solar|solutions|sony|soy|space|spiegel|spreadbetting"
165                     + "|srl|stada|starhub|statoil|stc|stcgroup|stockholm|studio|study|style|sucks|supplies"
166                     + "|supply|support|surf|surgery|suzuki|swatch|swiss|sydney|systems|s[abcdeghijklmnortuvxyz])"
167                     + "|(?:tab|taipei|tatamotors|tatar|tattoo|tax|taxi|team|tech|technology|tel|telefonica"
168                     + "|temasek|tennis|thd|theater|theatre|tickets|tienda|tips|tires|tirol|today|tokyo|tools"
169                     + "|top|toray|toshiba|tours|town|toyota|toys|trade|trading|training|travel|trust|tui|t[cdfghjklmnortvwz])"
170                     + "|(?:ubs|university|uno|uol|u[agksyz])"
171                     + "|(?:vacations|vana|vegas|ventures|versicherung|vet|viajes|video|villas|vin|virgin"
172                     + "|vision|vista|vistaprint|viva|vlaanderen|vodka|vote|voting|voto|voyage|v[aceginu])"
173                     + "|(?:wales|walter|wang|watch|webcam|website|wed|wedding|weir|whoswho|wien|wiki|williamhill"
174                     + "|win|windows|wine|wme|work|works|world|wtc|wtf|w[fs])"
175                     + "|(?:u03b5u03bb|u0431u0435u043b|u0434u0435u0442u0438|u043au043eu043c|u043cu043au0434"
176                     + "|u043cu043eu043d|u043cu043eu0441u043au0432u0430|u043eu043du043bu0430u0439u043d"
177                     + "|u043eu0440u0433|u0440u0443u0441|u0440u0444|u0441u0430u0439u0442|u0441u0440u0431"
178                     + "|u0443u043au0440|u049bu0430u0437|u0570u0561u0575|u05e7u05d5u05dd|u0627u0631u0627u0645u0643u0648"
179                     + "|u0627u0644u0627u0631u062fu0646|u0627u0644u062cu0632u0627u0626u0631|u0627u0644u0633u0639u0648u062fu064au0629"
180                     + "|u0627u0644u0645u063au0631u0628|u0627u0645u0627u0631u0627u062a|u0627u06ccu0631u0627u0646"
181                     + "|u0628u0627u0632u0627u0631|u0628u06beu0627u0631u062a|u062au0648u0646u0633"
182                     + "|u0633u0648u062fu0627u0646|u0633u0648u0631u064au0629|u0634u0628u0643u0629"
183                     + "|u0639u0631u0627u0642|u0639u0645u0627u0646|u0641u0644u0633u0637u064au0646"
184                     + "|u0642u0637u0631|u0643u0648u0645|u0645u0635u0631|u0645u0644u064au0633u064au0627"
185                     + "|u0645u0648u0642u0639|u0915u0949u092e|u0928u0947u091f|u092du093eu0930u0924"
186                     + "|u0938u0902u0917u0920u0928|u09adu09beu09b0u09a4|u0a2du0a3eu0a30u0a24|u0aadu0abeu0ab0u0aa4"
187                     + "|u0b87u0ba8u0bcdu0ba4u0bbfu0bafu0bbe|u0b87u0bb2u0b99u0bcdu0b95u0bc8|u0b9au0bbfu0b99u0bcdu0b95u0baau0bcdu0baau0bc2u0bb0u0bcd"
188                     + "|u0c2du0c3eu0c30u0c24u0c4d|u0dbdu0d82u0d9au0dcf|u0e04u0e2du0e21|u0e44u0e17u0e22"
189                     + "|u10d2u10d4|u307fu3093u306a|u30b0u30fcu30b0u30eb|u30b3u30e0|u4e16u754c"
190                     + "|u4e2du4fe1|u4e2du56fd|u4e2du570b|u4e2du6587u7f51|u4f01u4e1a|u4f5bu5c71"
191                     + "|u4fe1u606f|u5065u5eb7|u516bu5366|u516cu53f8|u516cu76ca|u53f0u6e7e|u53f0u7063"
192                     + "|u5546u57ce|u5546u5e97|u5546u6807|u5728u7ebf|u5927u62ff|u5a31u4e50|u5de5u884c"
193                     + "|u5e7fu4e1c|u6148u5584|u6211u7231u4f60|u624bu673a|u653fu52a1|u653fu5e9c"
194                     + "|u65b0u52a0u5761|u65b0u95fb|u65f6u5c1a|u673au6784|u6de1u9a6cu9521|u6e38u620f"
195                     + "|u70b9u770b|u79fbu52a8|u7ec4u7ec7u673au6784|u7f51u5740|u7f51u5e97|u7f51u7edc"
196                     + "|u8c37u6b4c|u96c6u56e2|u98deu5229u6d66|u9910u5385|u9999u6e2f|ub2f7ub137"
197                     + "|ub2f7ucef4|uc0bcuc131|ud55cuad6d|xbox"
198                     + "|xerox|xin|xn\-\-11b4c3d|xn\-\-1qqw23a|xn\-\-30rr7y|xn\-\-3bst00m|xn\-\-3ds443g"
199                     + "|xn\-\-3e0b707e|xn\-\-3pxu8k|xn\-\-42c2d9a|xn\-\-45brj9c|xn\-\-45q11c|xn\-\-4gbrim"
200                     + "|xn\-\-55qw42g|xn\-\-55qx5d|xn\-\-6frz82g|xn\-\-6qq986b3xl|xn\-\-80adxhks"
201                     + "|xn\-\-80ao21a|xn\-\-80asehdb|xn\-\-80aswg|xn\-\-90a3ac|xn\-\-90ais|xn\-\-9dbq2a"
202                     + "|xn\-\-9et52u|xn\-\-b4w605ferd|xn\-\-c1avg|xn\-\-c2br7g|xn\-\-cg4bki|xn\-\-clchc0ea0b2g2a9gcd"
203                     + "|xn\-\-czr694b|xn\-\-czrs0t|xn\-\-czru2d|xn\-\-d1acj3b|xn\-\-d1alf|xn\-\-efvy88h"
204                     + "|xn\-\-estv75g|xn\-\-fhbei|xn\-\-fiq228c5hs|xn\-\-fiq64b|xn\-\-fiqs8s|xn\-\-fiqz9s"
205                     + "|xn\-\-fjq720a|xn\-\-flw351e|xn\-\-fpcrj9c3d|xn\-\-fzc2c9e2c|xn\-\-gecrj9c"
206                     + "|xn\-\-h2brj9c|xn\-\-hxt814e|xn\-\-i1b6b1a6a2e|xn\-\-imr513n|xn\-\-io0a7i"
207                     + "|xn\-\-j1aef|xn\-\-j1amh|xn\-\-j6w193g|xn\-\-kcrx77d1x4a|xn\-\-kprw13d|xn\-\-kpry57d"
208                     + "|xn\-\-kput3i|xn\-\-l1acc|xn\-\-lgbbat1ad8j|xn\-\-mgb9awbf|xn\-\-mgba3a3ejt"
209                     + "|xn\-\-mgba3a4f16a|xn\-\-mgbaam7a8h|xn\-\-mgbab2bd|xn\-\-mgbayh7gpa|xn\-\-mgbbh1a71e"
210                     + "|xn\-\-mgbc0a9azcg|xn\-\-mgberp4a5d4ar|xn\-\-mgbpl2fh|xn\-\-mgbtx2b|xn\-\-mgbx4cd0ab"
211                     + "|xn\-\-mk1bu44c|xn\-\-mxtq1m|xn\-\-ngbc5azd|xn\-\-node|xn\-\-nqv7f|xn\-\-nqv7fs00ema"
212                     + "|xn\-\-nyqy26a|xn\-\-o3cw4h|xn\-\-ogbpf8fl|xn\-\-p1acf|xn\-\-p1ai|xn\-\-pgbs0dh"
213                     + "|xn\-\-pssy2u|xn\-\-q9jyb4c|xn\-\-qcka1pmc|xn\-\-qxam|xn\-\-rhqv96g|xn\-\-s9brj9c"
214                     + "|xn\-\-ses554g|xn\-\-t60b56a|xn\-\-tckwe|xn\-\-unup4y|xn\-\-vermgensberater\-ctb"
215                     + "|xn\-\-vermgensberatung\-pwb|xn\-\-vhquv|xn\-\-vuq861b|xn\-\-wgbh1c|xn\-\-wgbl6a"
216                     + "|xn\-\-xhq521b|xn\-\-xkc2al3hye2a|xn\-\-xkc2dl3a5ee0h|xn\-\-y9a3aq|xn\-\-yfro4i67o"
217                     + "|xn\-\-ygbi2ammx|xn\-\-zfr164b|xperia|xxx|xyz)"
218                     + "|(?:yachts|yamaxun|yandex|yodobashi|yoga|yokohama|youtube|y[et])"
219                     + "|(?:zara|zip|zone|zuerich|z[amw]))";
220 
221     /**
222      * Kept for backward compatibility reasons.
223      *
224      * @deprecated Deprecated since it does not include all IRI characters defined in RFC 3987
225      */
226     @Deprecated
227     public static final String GOOD_IRI_CHAR =
228             "a-zA-Z0-9u00A0-uD7FFuF900-uFDCFuFDF0-uFFEF";
229 
230     public static final Pattern IP_ADDRESS
231             = Pattern.compile(
232             "((25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\.(25[0-5]|2[0-4]"
233                     + "[0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1]"
234                     + "[0-9]{2}|[1-9][0-9]|[1-9]|0)\.(25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}"
235                     + "|[1-9][0-9]|[0-9]))");
236 
237     /**
238      * Valid UCS characters defined in RFC 3987. Excludes space characters.
239      */
240     private static final String UCS_CHAR = "[" +
241             "u00A0-uD7FF" +
242             "uF900-uFDCF" +
243             "uFDF0-uFFEF" +
244             "uD800uDC00-uD83FuDFFD" +
245             "uD840uDC00-uD87FuDFFD" +
246             "uD880uDC00-uD8BFuDFFD" +
247             "uD8C0uDC00-uD8FFuDFFD" +
248             "uD900uDC00-uD93FuDFFD" +
249             "uD940uDC00-uD97FuDFFD" +
250             "uD980uDC00-uD9BFuDFFD" +
251             "uD9C0uDC00-uD9FFuDFFD" +
252             "uDA00uDC00-uDA3FuDFFD" +
253             "uDA40uDC00-uDA7FuDFFD" +
254             "uDA80uDC00-uDABFuDFFD" +
255             "uDAC0uDC00-uDAFFuDFFD" +
256             "uDB00uDC00-uDB3FuDFFD" +
257             "uDB44uDC00-uDB7FuDFFD" +
258             "&&[^u00A0[u2000-u200A]u2028u2029u202Fu3000]]";
259 
260     /**
261      * Valid characters for IRI label defined in RFC 3987.
262      */
263     private static final String LABEL_CHAR = "a-zA-Z0-9" + UCS_CHAR;
264 
265     /**
266      * Valid characters for IRI TLD defined in RFC 3987.
267      */
268     private static final String TLD_CHAR = "a-zA-Z" + UCS_CHAR;
269 
270     /**
271      * RFC 1035 Section 2.3.4 limits the labels to a maximum 63 octets.
272      */
273     private static final String IRI_LABEL =
274             "[" + LABEL_CHAR + "](?:[" + LABEL_CHAR + "\-]{0,61}[" + LABEL_CHAR + "]){0,1}";
275 
276     /**
277      * RFC 3492 references RFC 1034 and limits Punycode algorithm output to 63 characters.
278      */
279     private static final String PUNYCODE_TLD = "xn\-\-[\w\-]{0,58}\w";
280 
281     private static final String TLD = "(" + PUNYCODE_TLD + "|" + "[" + TLD_CHAR + "]{2,63}" +")";
282 
283     private static final String HOST_NAME = "(" + IRI_LABEL + "\.)+" + TLD;
284 
285     public static final Pattern DOMAIN_NAME
286             = Pattern.compile("(" + HOST_NAME + "|" + IP_ADDRESS + ")");
287 
288     private static final String PROTOCOL = "(?i:http|https|rtsp):\/\/";
289 
290     /* A word boundary or end of input.  This is to stop foo.sure from matching as foo.su */
291     private static final String WORD_BOUNDARY = "(?:\b|$|^)";
292 
293     private static final String USER_INFO = "(?:[a-zA-Z0-9\$\-\_\.\+\!\*\'\(\)"
294             + "\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,64}(?:\:(?:[a-zA-Z0-9\$\-\_"
295             + "\.\+\!\*\'\(\)\,\;\?\&\=]|(?:\%[a-fA-F0-9]{2})){1,25})?\@";
296 
297     private static final String PORT_NUMBER = "\:\d{1,5}";
298 
299     private static final String PATH_AND_QUERY = "\/(?:(?:[" + LABEL_CHAR
300             + "\;\/\?\:\@\&\=\#\~"  // plus optional query params
301             + "\-\.\+\!\*\'\(\)\,\_])|(?:\%[a-fA-F0-9]{2}))*";
302 
303     /**
304      *  Regular expression pattern to match most part of RFC 3987
305      *  Internationalized URLs, aka IRIs.
306      */
307     public static final Pattern WEB_URL = Pattern.compile("("
308             + "("
309             + "(?:" + PROTOCOL + "(?:" + USER_INFO + ")?" + ")?"
310             + "(?:" + DOMAIN_NAME + ")"
311             + "(?:" + PORT_NUMBER + ")?"
312             + ")"
313             + "(" + PATH_AND_QUERY + ")?"
314             + WORD_BOUNDARY
315             + ")");
316 
317     /**
318      * Regular expression that matches known TLDs and punycode TLDs
319      */
320     private static final String STRICT_TLD = "(?:" +
321             IANA_TOP_LEVEL_DOMAINS + "|" + PUNYCODE_TLD + ")";
322 
323     /**
324      * Regular expression that matches host names using {@link #STRICT_TLD}
325      */
326     private static final String STRICT_HOST_NAME = "(?:(?:" + IRI_LABEL + "\.)+"
327             + STRICT_TLD + ")";
328 
329     /**
330      * Regular expression that matches domain names using either {@link #STRICT_HOST_NAME} or
331      * {@link #IP_ADDRESS}
332      */
333     private static final Pattern STRICT_DOMAIN_NAME
334             = Pattern.compile("(?:" + STRICT_HOST_NAME + "|" + IP_ADDRESS + ")");
335 
336     /**
337      * Regular expression that matches domain names without a TLD
338      */
339     private static final String RELAXED_DOMAIN_NAME =
340             "(?:" + "(?:" + IRI_LABEL + "(?:\.(?=\S))" +"?)+" + "|" + IP_ADDRESS + ")";
341 
342     /**
343      * Regular expression to match strings that do not start with a supported protocol. The TLDs
344      * are expected to be one of the known TLDs.
345      */
346     private static final String WEB_URL_WITHOUT_PROTOCOL = "("
347             + WORD_BOUNDARY
348             + "(?<!:\/\/)"
349             + "("
350             + "(?:" + STRICT_DOMAIN_NAME + ")"
351             + "(?:" + PORT_NUMBER + ")?"
352             + ")"
353             + "(?:" + PATH_AND_QUERY + ")?"
354             + WORD_BOUNDARY
355             + ")";
356 
357     /**
358      * Regular expression to match strings that start with a supported protocol. Rules for domain
359      * names and TLDs are more relaxed. TLDs are optional.
360      */
361     private static final String WEB_URL_WITH_PROTOCOL = "("
362             + WORD_BOUNDARY
363             + "(?:"
364             + "(?:" + PROTOCOL + "(?:" + USER_INFO + ")?" + ")"
365             + "(?:" + RELAXED_DOMAIN_NAME + ")?"
366             + "(?:" + PORT_NUMBER + ")?"
367             + ")"
368             + "(?:" + PATH_AND_QUERY + ")?"
369             + WORD_BOUNDARY
370             + ")";
371 
372     /**
373      * Regular expression pattern to match IRIs. If a string starts with http(s):// the expression
374      * tries to match the URL structure with a relaxed rule for TLDs. If the string does not start
375      * with http(s):// the TLDs are expected to be one of the known TLDs.
376      *
377      * @hide
378      */
379     public static final Pattern AUTOLINK_WEB_URL = Pattern.compile(
380             "(" + WEB_URL_WITH_PROTOCOL + "|" + WEB_URL_WITHOUT_PROTOCOL + ")");
381 
382     /**
383      * Regular expression for valid email characters. Does not include some of the valid characters
384      * defined in RFC5321: #&~!^`{}/=$*?|
385      */
386     private static final String EMAIL_CHAR = LABEL_CHAR + "\+\-_%'";
387 
388     /**
389      * Regular expression for local part of an email address. RFC5321 section 4.5.3.1.1 limits
390      * the local part to be at most 64 octets.
391      */
392     private static final String EMAIL_ADDRESS_LOCAL_PART =
393             "[" + EMAIL_CHAR + "]" + "(?:[" + EMAIL_CHAR + "\.]{1,62}[" + EMAIL_CHAR + "])?";
394 
395     /**
396      * Regular expression for the domain part of an email address. RFC5321 section 4.5.3.1.2 limits
397      * the domain to be at most 255 octets.
398      */
399     private static final String EMAIL_ADDRESS_DOMAIN =
400             "(?=.{1,255}(?:\s|$|^))" + HOST_NAME;
401 
402     /**
403      * Regular expression pattern to match email addresses. It excludes double quoted local parts
404      * and the special characters #&~!^`{}/=$*?| that are included in RFC5321.
405      * @hide
406      */
407     public static final Pattern AUTOLINK_EMAIL_ADDRESS = Pattern.compile("(" + WORD_BOUNDARY +
408             "(?:" + EMAIL_ADDRESS_LOCAL_PART + "@" + EMAIL_ADDRESS_DOMAIN + ")" +
409             WORD_BOUNDARY + ")"
410     );
411 
412     public static final Pattern EMAIL_ADDRESS
413             = Pattern.compile(
414             "[a-zA-Z0-9\+\.\_\%\-\+]{1,256}" +
415                     "\@" +
416                     "[a-zA-Z0-9][a-zA-Z0-9\-]{0,64}" +
417                     "(" +
418                     "\." +
419                     "[a-zA-Z0-9][a-zA-Z0-9\-]{0,25}" +
420                     ")+"
421     );
422 
423     /**
424      * This pattern is intended for searching for things that look like they
425      * might be phone numbers in arbitrary text, not for validating whether
426      * something is in fact a phone number.  It will miss many things that
427      * are legitimate phone numbers.
428      *
429      * <p> The pattern matches the following:
430      * <ul>
431      * <li>Optionally, a + sign followed immediately by one or more digits. Spaces, dots, or dashes
432      * may follow.
433      * <li>Optionally, sets of digits in parentheses, separated by spaces, dots, or dashes.
434      * <li>A string starting and ending with a digit, containing digits, spaces, dots, and/or dashes.
435      * </ul>
436      */
437     public static final Pattern PHONE
438             = Pattern.compile(                      // sdd = space, dot, or dash
439             "(\+[0-9]+[\- \.]*)?"        // +<digits><sdd>*
440                     + "(\([0-9]+\)[\- \.]*)?"   // (<digits>)<sdd>*
441                     + "([0-9][0-9\- \.]+[0-9])"); // <digit><digit|sdd>+<digit>
442 
443     /**
444      *  Convenience method to take all of the non-null matching groups in a
445      *  regex Matcher and return them as a concatenated string.
446      *
447      *  @param matcher      The Matcher object from which grouped text will
448      *                      be extracted
449      *
450      *  @return             A String comprising all of the non-null matched
451      *                      groups concatenated together
452      */
453     public static final String concatGroups(Matcher matcher) {
454         StringBuilder b = new StringBuilder();
455         final int numGroups = matcher.groupCount();
456 
457         for (int i = 1; i <= numGroups; i++) {
458             String s = matcher.group(i);
459 
460             if (s != null) {
461                 b.append(s);
462             }
463         }
464 
465         return b.toString();
466     }
467 
468     /**
469      * Convenience method to return only the digits and plus signs
470      * in the matching string.
471      *
472      * @param matcher      The Matcher object from which digits and plus will
473      *                     be extracted
474      *
475      * @return             A String comprising all of the digits and plus in
476      *                     the match
477      */
478     public static final String digitsAndPlusOnly(Matcher matcher) {
479         StringBuilder buffer = new StringBuilder();
480         String matchingRegion = matcher.group();
481 
482         for (int i = 0, size = matchingRegion.length(); i < size; i++) {
483             char character = matchingRegion.charAt(i);
484 
485             if (character == '+' || Character.isDigit(character)) {
486                 buffer.append(character);
487             }
488         }
489         return buffer.toString();
490     }
491 
492     /**
493      * Do not create this static utility class.
494      */
495     private Patterns() {}
496 }


调用方法

       // Wrap every URL found in message in an <a> tag, in a single pass over the input.
       // Rewriting through the matcher (instead of String.replaceAll on each link) keeps
       // the URL from being interpreted as a regex and stops a URL that occurs twice, or
       // inside another match, from being wrapped more than once.
       // NOTE: WEB_URL also matches scheme-less hosts such as "www.example.com"; those are
       // emitted as-is, so their href is a relative link.
       Matcher matcher = Patterns.WEB_URL.matcher(message);
       StringBuffer linked = new StringBuffer();
       while (matcher.find()) {
           String link = matcher.group();
           // The URL pattern allows characters such as ' and &, so escape HTML
           // metacharacters before placing the link inside the single-quoted attribute.
           String safeLink = link.replace("&", "&amp;")
                   .replace("<", "&lt;")
                   .replace(">", "&gt;")
                   .replace("\"", "&quot;")
                   .replace("'", "&#39;");
           String restr = "<a href='" + safeLink + "' target='_blank'>" + safeLink + "</a>";
           // quoteReplacement keeps '$' and '\' in the link from being read as group references.
           matcher.appendReplacement(linked, Matcher.quoteReplacement(restr));
       }
       matcher.appendTail(linked);
       message = linked.toString();

  

说明:

  1、message为要进行提取的字符串

  2、http/https 链接都可以提取,但链接结尾不能直接紧跟其他文字(例如中文)。因为该正则按 RFC 3987 允许 URL 路径中出现 Unicode 字符,紧跟在后面的文字会被当作链接的一部分一起匹配,所以需要用空格等字符把链接和后面的文字分开

  即:"你真的是https://www.cnblogs.com/pxblog博客网"会获取到"https://www.cnblogs.com/pxblog博客网"

      "你真的是https://www.cnblogs.com/pxblog 博客网"会获取到"https://www.cnblogs.com/pxblog"

原文地址:https://www.cnblogs.com/pxblog/p/12610060.html