@codeforces

@description@
@solution@
@accepted code@
@details@

@description@

给定 n 个互不相同的只包含 'a', 'b' 的字符串。

请选出最多的字符串，使得字符串两两之间没有包含关系（即不存在两个字符串 s, t 使得 s 是 t 的子串）。

输出方案。

Input
第一行一个整数 n (1 ≤ n ≤ 750) 表示字符串个数。
接下来 n 行每行一个只包含 'a', 'b' 的字符串。保证 n 个字符串总长不超过 10^7。

Output
第一行输出选出的字符串个数。
第二行输出你选择的字符串的编号，任意方案皆可。

Examples
Input
5
abab
aba
aabab
ababb
bab
Output
2
2 5
Note
3, 4 也是一种合法方案。

@solution@

注意包含关系其实是偏序关系，互相不包含的最大集合其实就是最长反链。

先考虑怎么得到串之间的偏序关系。因为子串总是某个前缀的后缀，所以对于每一个串的每一个前缀，只需向“该前缀的后缀中最长的那个给定串”连边，最后利用传递性跑一遍传递闭包即可。
我们对所有串建出 AC 自动机，用来找到某个前缀 S 的后缀中最长的给定串：在 fail 树上自上而下传递，让每个结点记录“沿着 fail 边向上跳时第一个遇到的完整串的编号”即可。
最后再把所有串拿到这个 AC 自动机上跑一跑即可。但是注意某个串本身的最长出现后缀是它自身，这个时候需要沿着 fail 边跳一次再得到此时的最长出现后缀。

最长反链直接转为最小链覆盖然后二分图上跑一跑即可。但是重点在于，这道题还要求输出最长反链的方案。
搜了好久，终于在这篇博客里找到了我想要得到的答案。。。

我们考虑对二分图求出最大独立集，则对于每个点，如果它拆成的两个点都在独立集中，则将它加入反链。至于证明其实比较简单：
（1）首先这样构造出来的肯定是个合法的反链。
（2）假如原图中的点数为 n，最大匹配为 m，则最小链覆盖为 n - m，最大独立集为 2*n - m。设拆成的两个点都在独立集中的点有 a 个，恰有一个在独立集中的点有 b 个，则 2*a + b = 2*n - m 且 a + b <= n，于是我们构造出来的反链长度 a >= n - m；又因任意反链的长度都不超过最小链覆盖的链数，所以有最大反链 <= n - m。于是得证。

那么问题转为怎么求二分图的最大独立集。因为最大独立集与最小点覆盖互为补集，我们考虑怎么去求最小点覆盖。
我们从 X 部的未匹配点出发，沿交错路径（即非匹配边 - 匹配边 - 非匹配边-...）访问点。最后得到的 X 部未访问点 + Y 部访问点即是最小点覆盖。
证明只需要证不存在一条边使得这条边的 X 部访问到且 Y 部未访问到即可，讨论一下这条边是匹配边还是非匹配边。

@accepted code@

#include<cstdio>
#include<queue>
#include<cstring>
#include<bitset>
#include<algorithm>
using namespace std;
// Size limits used for the statically allocated arrays in this file.
const int MAXN = 750;                      // maximum number of strings
const int MAXS = 10000000;                 // maximum total length of all strings (10^7)
const int MAXV = 2 * MAXN;                 // two flow vertices per string
const int MAXE = MAXV * MAXV + MAXV * 2;   // capacity of FlowGraph's edge pool
const int INF = 1 << 30;                   // "unbounded" amount passed to the first augment call
// Dinic-style max-flow on a statically allocated residual graph.
// relabel() only resets vertices s..t, so every vertex id in use must lie in [s, t].
// The adjacency heads and vis[] are never cleared here; the single global instance G
// relies on static zero-initialisation.
struct FlowGraph{
	// One directed arc of the residual network.
	struct edge{
		int to, flow, cap;   // head vertex, flow currently on the arc, capacity
		edge *nxt, *rev;     // next arc leaving the same tail; paired reverse arc
	}edges[MAXE + 5], *adj[MAXV + 5], *cur[MAXV + 5], *ecnt;

	// ecnt points at the last used slot; slot 0 itself is never handed out.
	FlowGraph() : ecnt(edges) {}

	// Adds arc u -> v with capacity c together with its zero-capacity reverse arc.
	void addedge(int u, int v, int c) {
		edge *fwd = ++ecnt;
		edge *bwd = ++ecnt;

		fwd->to = v;
		fwd->cap = c;
		fwd->flow = 0;
		fwd->nxt = adj[u];
		adj[u] = fwd;

		bwd->to = u;
		bwd->cap = 0;
		bwd->flow = 0;
		bwd->nxt = adj[v];
		adj[v] = bwd;

		fwd->rev = bwd;
		bwd->rev = fwd;
	}

	int d[MAXV + 5], s, t;

	// Recomputes d[] as residual distances to t (search runs backwards from t) and
	// rewinds the current-arc pointers. Returns whether s can still reach t.
	bool relabel() {
		const int unreached = t + 5;
		for (int v = s; v <= t; ++v) {
			d[v] = unreached;
			cur[v] = adj[v];
		}

		std::queue<int> pending;
		d[t] = 0;
		pending.push(t);
		while (!pending.empty()) {
			const int u = pending.front();
			pending.pop();
			for (edge *arc = adj[u]; arc; arc = arc->nxt) {
				// arc->rev is the arc (arc->to -> u); it must still have room.
				const edge *into = arc->rev;
				if (into->cap <= into->flow) continue;
				if (d[u] + 1 >= d[arc->to]) continue;
				d[arc->to] = d[u] + 1;
				pending.push(arc->to);
			}
		}
		return d[s] != unreached;
	}

	// Pushes up to tot units from x towards t along arcs that step one level closer
	// to t; returns the amount actually pushed.
	int aug(int x, int tot) {
		if (x == t) return tot;

		int pushed = 0;
		// The reference advances cur[x] itself, so exhausted arcs are not rescanned
		// within the current phase.
		for (edge *&arc = cur[x]; arc; arc = arc->nxt) {
			if (arc->cap <= arc->flow) continue;
			if (d[arc->to] + 1 != d[x]) continue;

			const int room = std::min(tot - pushed, arc->cap - arc->flow);
			const int got = aug(arc->to, room);
			pushed += got;
			arc->flow += got;
			arc->rev->flow -= got;
			if (pushed == tot) return pushed;
		}
		return pushed;
	}

	// Computes the maximum flow from _s to _t; s and t stay stored for later use.
	int max_flow(int _s, int _t) {
		s = _s;
		t = _t;
		int total = 0;
		while (relabel()) {
			total += aug(s, INF);
		}
		return total;
	}

	bool vis[MAXV + 5];

	// Marks every vertex reachable from x through arcs with remaining capacity.
	void dfs(int x) {
		vis[x] = true;
		for (edge *arc = adj[x]; arc; arc = arc->nxt) {
			if (vis[arc->to]) continue;
			if (arc->cap <= arc->flow) continue;
			dfs(arc->to);
		}
	}
}G;
int n, s, t;
bitset<MAXN + 5>e[MAXN + 5];
// Aho-Corasick automaton over the alphabet {'a', 'b'}.
// Node 0 is the root; a child index of 0 means "no child" until build() fills it in.
struct ACM{
	struct node{
		int ch[2], fail;   // transitions on 'a' / 'b', and the fail link
		int tag;           // index of the string ending here; after build(), inherited
		                   // from the fail link when no string ends here; 0 = none
	}nd[MAXS + 5];
	int ncnt, root;

	ACM() : ncnt(0), root(0) {}

	// Inserts the first lenS characters of S and tags the final node with x.
	void add(char *S, int lenS, int x) {
		int at = root;
		for (int i = 0; i < lenS; ++i) {
			int &slot = nd[at].ch[S[i] - 'a'];
			if (slot == 0) slot = ++ncnt;
			at = slot;
		}
		nd[at].tag = x;
	}

	// Breadth-first pass that sets fail links, fills missing transitions with the
	// fail target's transition, and pushes tags down along fail links.
	void build() {
		std::queue<int> bfs;
		for (int c = 0; c < 2; ++c) {
			if (nd[root].ch[c]) bfs.push(nd[root].ch[c]);
		}
		while (!bfs.empty()) {
			const int u = bfs.front();
			bfs.pop();
			for (int c = 0; c < 2; ++c) {
				const int viaFail = nd[nd[u].fail].ch[c];
				const int child = nd[u].ch[c];
				if (!child) {
					// No real child: redirect the transition.
					nd[u].ch[c] = viaFail;
					continue;
				}
				nd[child].fail = viaFail;
				if (!nd[child].tag) nd[child].tag = nd[viaFail].tag;
				bfs.push(child);
			}
		}
	}

	// Walks S through the automaton and records in e[x] the tag seen at the node of
	// every proper prefix (root included). For the whole string the tag is read from
	// the fail target instead, so the node's own tag is not used there.
	void solve(char *S, int lenS, int x) {
		int at = root;
		for (int i = 0; i < lenS; ++i) {
			const int hit = nd[at].tag;
			if (hit) e[x][hit] = true;
			at = nd[at].ch[S[i] - 'a'];
		}
		const int suffixHit = nd[nd[at].fail].tag;
		if (suffixHit) e[x][suffixHit] = true;
	}
}ac;
// All input strings stored back to back; main reads string i to offset len[i-1].
char S[MAXS + 5];
// len[i] is the combined length of the first i strings (len[0] stays 0).
int len[MAXN + 5];
int main() {
	scanf("%d", &n), s = 0, t = 2*n + 1;
	for(int i=1;i<=n;i++) {
		scanf("%s", S + len[i-1]);
		len[i] = len[i-1] + strlen(S + len[i-1]);
		ac.add(S + len[i-1], len[i] - len[i-1], i);
	}
	ac.build();
	for(int i=1;i<=n;i++)
		G.addedge(s, i, 1), G.addedge(n + i, t, 1);
	for(int i=1;i<=n;i++)
		ac.solve(S + len[i-1], len[i] - len[i-1], i);
	for(int k=1;k<=n;k++)
		for(int i=1;i<=n;i++)
			if( e[i][k] ) e[i] |= e[k];
	for(int i=1;i<=n;i++)
		for(int j=1;j<=n;j++)
			if( e[i][j] ) G.addedge(i, j + n, 1);
	printf("%d
", n - G.max_flow(s, t));
	G.dfs(G.s);
	for(int i=1;i<=n;i++)
		if( G.vis[i] && (!G.vis[i+n]) )
			printf("%d ", i);
}

@details@

不要问为什么 O(n^3) 可以跑 750 的数据范围，问就是 bitset 优化传递闭包 + dinic 跑最大匹配 + codeforces 的机子。
一开始因为不熟悉 bitset（从来没用过 2333）还 WA 了几次。

题目给出的字符集只有 'a'，'b' 其实只是纯粹防止你内存开太大 MLE 而已。

顺便，dinic 求独立集有一个好处，就是因为反向边的存在，可以直接判断是否满流然后跑 dfs。