import requests

# Download the plain-text edition of RFC 1945 from the RFC Editor.
# The timeout keeps the run from hanging on an unresponsive server, and
# raise_for_status() prevents an HTTP error page from being used as RFC text.
response = requests.get("https://www.rfc-editor.org/rfc/rfc1945.txt", timeout=30)
response.raise_for_status()
rfc1945 = response.text


# Characters 6..916 of the document form the excerpt that the NER steps
# further down operate on; bind it first, then display it.
rfc1945_excerpt = rfc1945[6:917]
print(rfc1945_excerpt)

Network Working Group                                     T. Berners-Lee
Request for Comments: 1945                                       MIT/LCS
Category: Informational                                      R. Fielding
                                                               UC Irvine
                                                              H. Frystyk
                                                                 MIT/LCS
                                                                May 1996


                Hypertext Transfer Protocol -- HTTP/1.0

Status of This Memo

   This memo provides information for the Internet community.  This memo
   does not specify an Internet standard of any kind.  Distribution of
   this memo is unlimited.

IESG Note:

   The IESG has concerns about this protocol, and expects this document
   to be replaced relatively soon by a standards track document.


import json

# Fetch the RFC Editor's JSON metadata record for RFC 1945.
# The timeout bounds the wait on the server; raise_for_status() reports an
# HTTP error as such instead of as a confusing JSON decoding failure.
response = requests.get("https://www.rfc-editor.org/rfc/rfc1945.json", timeout=30)
response.raise_for_status()
rfc1945_metadata = json.loads(response.content)


rfc1945_metadata

{
    'draft': '',
    'doc_id': 'RFC1945',
    'title': ' Hypertext Transfer Protocol -- HTTP/1.0 ',
    'authors': ['T. Berners-Lee', 'R. Fielding', 'H. Frystyk'],
    'format': ['ASCII', 'HTML'],
    'page_count': '60',
    'pub_status': 'INFORMATIONAL',
    'status': 'INFORMATIONAL',
    'source': 'HyperText Transfer Protocol',
    'abstract': ' The Hypertext Transfer Protocol (HTTP) is an application-level protocol with the lightness and speed necessary for distributed, collaborative, hypermedia information systems.  This memo provides information for the Internet community.  This memo does not specify an Internet standard of any kind.  ',
    'pub_date': 'April 1996',
    'keywords': ['HTTP-1.0', 'HTTP', 'World-Wide', 'Web', 'application'],
    'obsoletes': [],
    'obsoleted_by': [],
    'updates': [],
    'updated_by': [],
    'see_also': [],
    'doi': '10.17487/RFC1945',
    'errata_url': None
}


import spacy
from spacy import displacy

# Name of the spaCy pipeline package used for entity recognition below.
language_model = "en_core_web_sm"

# Download the package only when spaCy does not already find it installed.
if not spacy.util.is_package(language_model):
    spacy.cli.download(language_model)

# Load the pipeline; `nlp` is the callable applied to the excerpt further down.
nlp = spacy.load(language_model)


# Baseline: run the unmodified pipeline over the excerpt and show its entities.
doc = nlp(rfc1945_excerpt)
displacy.render(doc, style="ent")


# Insert an entity ruler in front of the "ner" component and register one
# exact-phrase STD_BODY pattern per standards body.
ruler = nlp.add_pipe("entity_ruler", before="ner")
ruler.add_patterns(
    [
        {"label": "STD_BODY", "pattern": body_name}
        for body_name in ("Network Working Group", "IESG")
    ]
)


# Process the excerpt again so the new rules apply, then render the result.
doc = nlp(rfc1945_excerpt)
displacy.render(doc, style="ent")


more_complex_pattern

[
    {
        'label': 'STANDARD',
        'pattern': [
            {
                'LOWER': {
                    'IN': [
                        'bcp',
                        'fyi',
                        'ien',
                        'obsoleted',
                        'obsoletes',
                        'request',
                        'rfc',
                        'std',
                        'updated',
                        'updates'
                    ]
                }
            },
            {'LOWER': {'IN': ['for', 'by']}, 'OP': '?'},
            {'TEXT': 'Comments', 'OP': '?'},
            {'IS_PUNCT': True, 'OP': '?'},
            {'IS_DIGIT': True}
        ]
    }
]


# Register the token-level pattern list with the existing entity ruler, then
# re-process the excerpt and render its entities again.
# NOTE(review): more_complex_pattern is not defined in the visible source —
# presumably built in an earlier cell; confirm before running this in isolation.
ruler.add_patterns(more_complex_pattern)
doc = nlp(rfc1945_excerpt)
displacy.render(doc, style="ent")


import networkx as nx

# Start from an empty graph and seed it with the RFC's own node followed by
# one node per author; no edges are added at this stage.
G_rfc1945 = nx.Graph()
G_rfc1945.add_nodes_from([rfc1945_metadata["doc_id"], *rfc1945_metadata["authors"]])
print(G_rfc1945.nodes(data=True))

[('RFC1945', {}), ('T. Berners-Lee', {}), ('R. Fielding', {}), ('H. Frystyk', {})]


graph.show()


# Connect the RFC node to each of its authors (one edge per author) and
# print the resulting edge list.
G_rfc1945.add_edges_from(
    (rfc1945_metadata["doc_id"], author_name)
    for author_name in rfc1945_metadata["authors"]
)
print(G_rfc1945.edges(data=True))

[('RFC1945', 'T. Berners-Lee', {}), ('RFC1945', 'R. Fielding', {}), ('RFC1945', 'H. Frystyk', {})]


graph.show()


from networkx.algorithms import bipartite

# Ask networkx whether the RFC/author graph is bipartite; every edge added so
# far joins the document node to an author node.
bipartite.is_bipartite(G_rfc1945)

True


graph.show()


def _fetch_rfc_metadata(session, rfc):
    """Return the RFC Editor's JSON metadata record for *rfc* (e.g. ``"rfc1630"``).

    The request goes through *session* with a timeout so that an unresponsive
    server cannot stall the run.

    Raises:
        requests.HTTPError: if the server answers with an error status.
    """
    response = session.get(f"https://www.rfc-editor.org/rfc/{rfc}.json", timeout=30)
    response.raise_for_status()
    return json.loads(response.content)


# Use one session for all downloads so the connection to the host is reused
# instead of being re-established for every document.
with requests.Session() as session:
    rfcs_by_tbl_metadata = [
        _fetch_rfc_metadata(session, rfc)
        for rfc in [
            "rfc1630",
            "rfc1738",
            "rfc1866",
            "rfc2068",
            "rfc2396",
            "rfc2616",
            "rfc3986",
        ]
    ]

# Build the document/author graph: one edge per (RFC, author) pair.
# add_edge creates any node that does not exist yet, so authors shared by
# several RFCs end up as a single node with several edges.
G_tbl = nx.Graph()
for rfc in rfcs_by_tbl_metadata:
    for author in rfc["authors"]:
        G_tbl.add_edge(rfc["doc_id"], author)


graph.show()


graph.show()


graph.show()


graph.show()


graph.show()


degree_centrality


graph.show()

'Graph with 1141 nodes and 2201 edges'


graph.show()

'Graph with 651 nodes and 1758 edges'


G_94_99_uni_giant_communities


G_94_99_uni_giant_author_measures.round(3)


G_94_99_uni_giant_communities_leaders.round(3)


most_important_authors_shortest_path_matrix.astype(int)


most_important_authors_shortest_path_matrix_color.show()

	author	degree centrality
0	T. Berners-Lee	1.000
1	L. Masinter	0.875
2	P. Leach	0.750
3	R. Fielding	0.750
4	J. Mogul	0.750
5	H. Frystyk	0.750
6	J. Gettys	0.750
7	M. McCahill	0.250
8	D. Connolly	0.125

	author	degree centrality	betweenness centrality	eigenvector centrality	pagerank	community
0	S. Deering	0.051	0.121	0.282	0.007	0
1	Y. Rekhter	0.051	0.142	0.125	0.008	1
2	L. Zhang	0.045	0.052	0.259	0.005	0
3	F. Baker	0.043	0.134	0.090	0.006	0
4	V. Jacobson	0.042	0.076	0.261	0.005	0
...	...	...	...	...	...	...
646	C. Burton	0.002	0.000	0.000	0.000	4
647	M. Beadles	0.002	0.000	0.000	0.000	15
648	D. Perkins	0.002	0.000	0.000	0.001	16
649	T. Henderson	0.002	0.000	0.014	0.000	0
650	D. Stenerson	0.002	0.000	0.000	0.000	3

	author count	author	eigenvector centrality
community
0	96	S. Deering	0.282
1	86	Y. Rekhter	0.125
2	60	J. Postel	0.065
3	59	M. Kosters	0.007
4	46	B. Carpenter	0.029
5	36	J. Mogul	0.021
6	34	W. Simpson	0.000
7	32	M. Allman	0.036
8	30	R. Hinden	0.069
9	27	H. Schulzrinne	0.031
10	26	H. Alvestrand	0.012
11	25	G. Parsons	0.004
12	24	P. Hoffman	0.001
13	20	C. Perkins	0.010
14	19	A. Smith	0.001
15	13	B. Aboba	0.000
16	9	P. Vixie	0.001
17	5	A. Orda	0.003
18	4	R. Coltun	0.006

	S. Deering	Y. Rekhter	J. Postel	M. Kosters	B. Carpenter	J. Mogul	W. Simpson	M. Allman	R. Hinden	H. Schulzrinne	H. Alvestrand	G. Parsons	P. Hoffman	C. Perkins	A. Smith	B. Aboba	P. Vixie	A. Orda	R. Coltun
S. Deering	0	1	1	2	2	1	3	2	1	2	2	3	3	2	4	6	3	3	3
Y. Rekhter	1	0	1	2	1	2	3	3	1	3	3	3	3	3	4	6	2	4	3
J. Postel	1	1	0	1	2	2	4	3	1	3	3	3	3	3	4	7	3	4	3
M. Kosters	2	2	1	0	3	3	5	4	2	4	4	4	4	4	5	8	3	5	4
B. Carpenter	2	1	2	3	0	2	4	3	1	3	2	4	2	4	4	7	3	4	4
J. Mogul	1	2	2	3	2	0	4	3	2	3	2	4	2	3	5	7	4	4	4
W. Simpson	3	3	4	5	4	4	0	4	3	5	2	4	4	5	5	3	5	5	4
M. Allman	2	3	3	4	3	3	4	0	3	3	3	3	5	4	4	7	5	3	4
R. Hinden	1	1	1	2	1	2	3	3	0	3	2	3	2	3	4	6	3	4	3
H. Schulzrinne	2	3	3	4	3	3	5	3	3	0	4	4	5	2	5	8	5	4	4
H. Alvestrand	2	3	3	4	2	2	2	3	2	4	0	2	2	4	3	5	4	5	3
G. Parsons	3	3	3	4	4	4	4	3	3	4	2	0	4	5	4	7	5	5	3
P. Hoffman	3	3	3	4	2	2	4	5	2	5	2	4	0	5	5	7	5	6	5
C. Perkins	2	3	3	4	4	3	5	4	3	2	4	5	5	0	5	8	5	5	4
A. Smith	4	4	4	5	4	5	5	4	4	5	3	4	5	5	0	8	6	4	3
B. Aboba	6	6	7	8	7	7	3	7	6	8	5	7	7	8	8	0	8	8	7
P. Vixie	3	2	3	3	3	4	5	5	3	5	4	5	5	5	6	8	0	6	5
A. Orda	3	4	4	5	4	4	5	3	4	4	5	5	6	5	4	8	6	0	4
R. Coltun	3	3	3	4	4	4	4	4	3	4	3	3	5	4	3	7	5	4	0

	community	author count
0	0	96
1	1	86
2	2	60
3	3	59
4	4	46
5	5	36
6	6	34
7	7	32
8	8	30
9	9	27
10	10	26
11	11	25
12	12	24
13	13	20
14	14	19
15	15	13
16	16	9
17	17	5
18	18	4

	community	author count
0	0	96
1	1	86
2	2	60
3	3	59
4	4	46
5	5	36
6	6	34
7	7	32
8	8	30
9	9	27
10	10	26
11	11	25
12	12	24
13	13	20
14	14	19
15	15	13
16	16	9
17	17	5
18	18	4

The corpus as a network¶

Overview¶

The Evolution of Internet Governance¶

The RFC Editor¶

An example: RFC 1945¶

Retrieving the source document¶

Looking at the source document¶

Some remarks about the source document¶

Retrieving the metadata¶

Looking at the metadata¶

More metadata¶

Caveats¶

Named Entity Recognition (NER) with a pre-trained model¶

Named Entity Recognition (NER) with a pre-trained model¶

Named Entity Recognition (NER) with a pre-trained model¶

Adding custom rules to the NER pipeline¶

Adding more complex rules to the NER pipeline¶

Adding more complex rules to the NER pipeline¶

Turning RFC 1945 into a graph¶

Adding nodes to the graph¶

Adding nodes to the graph¶

Adding edges to the graph¶

Adding edges to the graph¶

Properties of this graph¶

Draw the graph as a bipartite graph¶

Adding more standards by Tim Berners-Lee to the graph¶

Adding more standards by Tim Berners-Lee to the graph¶

Adding more standards by Tim Berners-Lee to the graph¶

From authorship to co-authorship¶

Turning a bipartite graph into a unipartite graph¶

Co-authors of RFC 1945¶

All co-authors of Tim Berners-Lee¶

All co-authors of Tim Berners-Lee¶

All co-authors of Tim Berners-Lee¶

All co-authors of Tim Berners-Lee¶

Analyzing the graph¶

Analyzing a bigger network¶

A word about graph components¶

Analyzing the largest connected component¶

Analyzing the largest connected component¶

Analyzing the community structure¶

Analyzing the community structure¶

A word about centrality measures¶

Analyzing the community structure¶

Analyzing the community structure¶

The distance between two communities and their "leaders"¶

Recap¶

The one book everyone should read¶

Thank you for your attention!¶

	community	author count
0	0	96
1	1	86
2	2	60
3	3	59
4	4	46
5	5	36
6	6	34
7	7	32
8	8	30
9	9	27
10	10	26
11	11	25
12	12	24
13	13	20
14	14	19
15	15	13
16	16	9
17	17	5
18	18	4