// === auto-generated by 4cat map_item sync — BLOCK REPLACED AUTOMATICALLY ===
// (regenerated from datasources/instagram/search_instagram.py)

/**
 * Map one raw Instagram item to a flat record wrapped in a `MappedItem`.
 *
 * Dispatches on `__typename`: names containing "polaris" go to
 * `parse_polaris_item`, any other `__typename` except `XDTMediaDict` goes to
 * `parse_graph_item`, and everything else goes to `parse_itemlist_item`.
 *
 * @param {Object} item - Raw item as captured.
 * @returns {MappedItem} The mapped record.
 * @throws {MapItemException} If the item looks like an ad or cannot be parsed.
 */
export function map_item(item) {
  const link = item['link'] ?? '';
  const is_ad_link =
    typeof link === 'string' && link.startsWith('https://www.facebook.com/ads/ig_redirect');
  if (item['product_type'] === 'ad' || is_ad_link) {
    // These are ads
    throw new MapItemException('appears to be Instagram ad, check raw data to confirm and ensure Zeeschuimer is up to date.');
  }

  const typename = item['__typename'];

  // Substring test on the type name. The `in` operator cannot be used here:
  // applied to a string it throws a TypeError instead of searching it.
  const is_polaris_response =
    typeof typename === 'string' && typename.toLowerCase().includes('polaris');
  const is_graph_response = '__typename' in item && typename !== 'XDTMediaDict';

  if (is_polaris_response) {
    return new MappedItem(parse_polaris_item(item));
  }
  if (is_graph_response) {
    return new MappedItem(parse_graph_item(item));
  }
  return new MappedItem(parse_itemlist_item(item));
}

/**
 * Verify that `user` and `owner` refer to the same account.
 *
 * The check only runs when both objects carry data; an empty object counts as
 * "not provided". Matching ids are accepted outright (owner may contain less
 * info than user); otherwise the usernames must match.
 *
 * @param {Object} user - The item's `user` object, or `{}` when absent.
 * @param {Object} owner - The item's `owner` object, or `{}` when absent.
 * @throws {MapItemException} If both are present and describe different accounts.
 */
function check_user_owner_match(user, owner) {
  const has_user = Object.keys(user).length > 0;
  const has_owner = Object.keys(owner).length > 0;
  if (!has_user || !has_owner) {
    return;
  }
  if (owner['id'] === user['id']) {
    // Same id; owner may contain less info (e.g. no full name, username, etc.), so prefer user
    return;
  }
  if (user['username'] !== owner['username']) {
    throw new MapItemException('Unable to parse item: different user and owner');
  }
}

/**
 * Pick the play count: `view_count` first, then `play_count`.
 *
 * Uses `??` so that both `null` and a missing key fall through to the next
 * candidate; a strict `!== null` test would accept `undefined` as a value.
 *
 * @param {Object} node - Raw item.
 * @returns {number|MissingMappedField} The count, or a missing-field marker.
 */
function pick_play_count(node) {
  return node['view_count'] ?? node['play_count'] ?? new MissingMappedField(0);
}

/**
 * Flatten an item's `location` object into the four location columns.
 *
 * @param {Object|null|undefined} raw_location - The item's `location` value.
 * @returns {{name: *, latlong: string, city: *, location_id: *}} Location fields;
 *   all empty strings when no location is given.
 */
function map_location(raw_location) {
  const location = {'name': '', 'latlong': '', 'city': '', 'location_id': ''};
  if (raw_location) {
    location['name'] = raw_location['name'];
    location['location_id'] = raw_location['pk'];
    location['latlong'] = raw_location['lat'] && raw_location['lng'] ? `${raw_location['lat']},${raw_location['lng']}` : '';
    location['city'] = raw_location['city'];
  }
  return location;
}

/**
 * Read the caption text of a polaris or item-list node.
 *
 * @param {Object} node - Raw item.
 * @returns {string|MissingMappedField} The caption text, `''` when the
 *   `caption` key is present but empty, or a missing-field marker when the key
 *   is absent.
 */
function map_caption(node) {
  if (!('caption' in node)) {
    return new MissingMappedField('');
  }
  return node['caption'] ? node['caption']['text'] : '';
}

/**
 * Map an item whose `__typename` contains "polaris".
 *
 * Timestamps, coauthors, usertags, like/comment counts and location are always
 * emitted as `MissingMappedField` for this format.
 *
 * @param {Object} node - Raw item.
 * @returns {Object} Flat mapped record.
 * @throws {MapItemException} If `user` and `owner` describe different accounts.
 */
function parse_polaris_item(node) {
  const partial_item = node['_zs_partial'] ?? false;
  const collected_at = new MissingMappedField(0);
  const unix_at = new MissingMappedField(0);
  const caption = map_caption(node);

  // Either object may be absent; default both so later lookups cannot throw.
  const user = node['user'] ?? {};
  const owner = node['owner'] ?? {};
  check_user_owner_match(user, owner);
  const is_verified = user['is_verified'] ?? new MissingMappedField(false);

  // media type
  const type_map = {'XIGPolarisPhotoMedia': 'photo', 'XIGPolarisVideoMedia': 'video'};
  const media_type = type_map[node['__typename']] ?? 'unknown';
  const num_media = node['__typename'] !== 'XIGPolarisCarouselMedia' ? 1 : (node['carousel_media'] ? node['carousel_media'].length : 0);

  // get media urls
  const display_urls = node['display_uri'] ?? new MissingMappedField('');
  // Optional chaining covers a missing key as well as a null or empty list.
  const media_urls = node['video_versions']?.[0]?.['url'] ?? new MissingMappedField('');

  return {
    // Post and caption
    'collected_from_url': normalize_url_encoding(node['__import_meta']?.['source_platform_url']), // Zeeschuimer metadata
    'collected_from_view': node['_zs_instagram_view'] ?? '',
    'partial_item': partial_item,
    'id': node['code'],
    'timestamp': collected_at,
    'thread_id': node['code'],
    'parent_id': node['code'],
    'url': `https://www.instagram.com/p/${node['code']}`,
    'body': caption,

    // Authors
    'author_id': user['id'] ?? owner['id'] ?? new MissingMappedField(''), // This should always be present
    'author': user['username'] ?? owner['username'] ?? new MissingMappedField(''),
    'author_fullname': user['full_name'] ?? owner['full_name'] ?? new MissingMappedField(''),
    'verified': is_verified,
    'author_avatar_url': user['profile_pic_url'] ?? owner['profile_pic_url'] ?? new MissingMappedField(''),

    // Not available in this format
    'coauthors': new MissingMappedField(''),
    'coauthor_fullnames': new MissingMappedField(''),
    'coauthor_ids': new MissingMappedField(''),

    // Media
    'media_type': media_type,
    'num_media': num_media,
    'image_urls': display_urls,
    'media_urls': media_urls,

    // Engagement
    'hashtags': extract_hashtags(caption),
    'usertags': new MissingMappedField(''), // Not available in this format
    'play_count': node['play_count'] ?? new MissingMappedField(0),

    'likes_hidden': new MissingMappedField(''), // Not available in this format
    'num_likes': new MissingMappedField(0),
    'num_comments': new MissingMappedField(0),

    // Location not available (even for location tags)
    'location_name': new MissingMappedField(''),
    'location_id': new MissingMappedField(''),
    'location_latlong': new MissingMappedField(''),
    'location_city': new MissingMappedField(''),

    // Metadata
    'unix_timestamp': unix_at,
    'missing_media': null, // This denotes media that is unable to be mapped and is otherwise None
  };
}

/**
 * Map an item in the "graph" format (edge/node structure, `shortcode` id).
 *
 * @param {Object} node - Raw item.
 * @returns {Object} Flat mapped record.
 * @throws {MapItemException} If `user` and `owner` describe different accounts.
 */
function parse_graph_item(node) {
  const caption = node['edge_media_to_caption']?.['edges']?.[0]?.['node']?.['text'] ?? new MissingMappedField('');

  const is_sidecar = node['__typename'] === 'GraphSidecar';
  const num_media = !is_sidecar ? 1 : (node['edge_sidecar_to_children'] ? node['edge_sidecar_to_children']['edges'].length : 0);

  // get media url; for a sidecar the first child is used
  const media_node = is_sidecar ? node['edge_sidecar_to_children']['edges'][0]['node'] : node;

  let media_url;
  if (media_node['__typename'] === 'GraphVideo') {
    media_url = media_node['video_url'];
  } else if (media_node['__typename'] === 'GraphImage') {
    const resources = media_node['display_resources'] ?? media_node['thumbnail_resources'];
    // Take the last resource without pop(), so the caller's item is not mutated.
    media_url = resources?.[resources.length - 1]?.['src'] ?? media_node['display_url'] ?? '';
  } else {
    media_url = media_node['display_url'];
  }

  // type, 'mixed' means carousel with video and photo
  // NOTE(review): 'GraphImage' has no entry here, so images map to 'unknown';
  // left as-is in case it mirrors the Python reference — confirm intent.
  const type_map = {'GraphSidecar': 'photo', 'GraphVideo': 'video'};
  let media_type;
  if (!is_sidecar) {
    media_type = type_map[node['__typename']] ?? 'unknown';
  } else {
    const child_types = new Set(node['edge_sidecar_to_children']['edges'].map((edge) => edge['node']['__typename']));
    media_type = child_types.size > 1 ? 'mixed' : (type_map[child_types.values().next().value] ?? 'unknown');
  }

  const location = map_location(node['location']);
  const no_likes = Boolean(node['like_and_view_counts_disabled']);

  // Either object may be absent; default both so later lookups cannot throw.
  const user = node['user'] ?? {};
  const owner = node['owner'] ?? {};
  check_user_owner_match(user, owner);

  const tagged_users = node['edge_media_to_tagged_user'];

  return {
    // Post data
    'id': node['shortcode'],
    'post_source_domain': normalize_url_encoding(node['__import_meta']?.['source_platform_url']), // Zeeschuimer metadata
    'collected_from_view': node['_zs_instagram_view'] ?? new MissingMappedField(''),
    'partial_item': node['_zs_partial'] ?? new MissingMappedField(''),
    'timestamp': formatUtcTimestamp(node['taken_at_timestamp']),
    'thread_id': node['shortcode'],
    'parent_id': node['shortcode'],
    'url': `https://www.instagram.com/p/${node['shortcode']}`,
    'body': caption,

    // Author data
    'author': user['username'] ?? owner['username'] ?? new MissingMappedField(''),
    'author_fullname': user['full_name'] ?? owner['full_name'] ?? new MissingMappedField(''),
    'is_verified': Boolean(user['is_verified']),
    'author_avatar_url': user['profile_pic_url'] ?? owner['profile_pic_url'] ?? new MissingMappedField(''),
    // Unable to find graph type posts to test
    'coauthors': new MissingMappedField(''),
    'coauthor_fullnames': new MissingMappedField(''),
    'coauthor_ids': new MissingMappedField(''),

    // Media
    'media_type': media_type,
    'num_media': num_media,
    'image_urls': node['display_url'],
    'media_urls': media_url,

    // Engagement
    'hashtags': extract_hashtags(caption),
    // Unsure if usertags will work; need data (this could raise it to attention...)
    'usertags': tagged_users ? tagged_users['edges'].map((edge) => edge['node']['user']['username']).join(',') : '',
    'play_count': pick_play_count(node),
    'likes_hidden': no_likes ? 'yes' : 'no',
    'num_likes': !no_likes ? node['edge_media_preview_like']['count'] : new MissingMappedField(0),
    'num_comments': node['edge_media_preview_comment']?.['count'] ?? 0,

    // Location data
    'location_name': location['name'],
    'location_id': location['location_id'],
    'location_latlong': location['latlong'],
    'location_city': location['city'],

    // Metadata
    'unix_timestamp': node['taken_at_timestamp'],
    'missing_media': null,
  };
}

/**
 * Map an item in the "item list" format (numeric `media_type`, `code` id).
 *
 * @param {Object} node - Raw item.
 * @returns {Object} Flat mapped record.
 * @throws {MapItemException} If a non-partial video lacks the expected media
 *   keys, or if `user` and `owner` describe different accounts.
 */
function parse_itemlist_item(node) {
  const partial_item = node['_zs_partial'] ?? false;
  const is_carousel = node['media_type'] === MEDIA_TYPE_CAROUSEL;
  const num_media = !is_carousel ? 1 : (node['carousel_media'] ? node['carousel_media'].length : 0);
  const caption = map_caption(node);

  // get media urls
  const display_urls = [];
  const media_urls = [];
  let missing_media = null;
  // Computed keys: a plain `{MEDIA_TYPE_PHOTO: ...}` literal would use the
  // identifier's *name* as the key rather than the constant's value.
  const type_map = {[MEDIA_TYPE_PHOTO]: 'photo', [MEDIA_TYPE_VIDEO]: 'video'};
  const media_types = new Set();
  // A carousel without `carousel_media` yields no media rather than crashing,
  // matching how num_media treats the same case.
  const media_nodes = is_carousel ? (node['carousel_media'] ?? []) : [node];

  for (const media_node of media_nodes) {
    if (media_node['media_type'] === MEDIA_TYPE_VIDEO) {
      // Get thumbnail
      if ('image_versions2' in media_node) {
        display_urls.push(media_node['image_versions2']['candidates'][0]['url']);
      } else if ('video_versions' in media_node) {
        // no image links at all :-/
        // video is all we have
        display_urls.push(media_node['video_versions'][0]['url']);
      } else if (!partial_item) {
        // New format (known partial items are allowed to lack media)
        throw new MapItemException('Instagram item format change');
      }

      // Videos if present
      if ('video_versions' in media_node) {
        media_urls.push(media_node['video_versions'][0]['url']);
      } else if (!partial_item) {
        // New format (known partial items are allowed to lack media)
        throw new MapItemException('Instagram item format change');
      }
    } else if (media_node['media_type'] === MEDIA_TYPE_PHOTO && media_node['image_versions2']) {
      // Images
      const media_url = media_node['image_versions2']['candidates'][0]['url'];
      display_urls.push(media_url);
      media_urls.push(media_url);
    } else {
      missing_media = new MissingMappedField('');
    }

    media_types.add(type_map[media_node['media_type']] ?? 'unknown');
  }

  // type, 'mixed' means carousel with video and photo
  const media_type = media_types.size > 1 ? 'mixed' : ([...media_types][0] ?? 'unknown');

  let num_comments = -1;
  if ('comment_count' in node) {
    num_comments = node['comment_count'];
  } else if ('comments' in node && Array.isArray(node['comments'])) {
    num_comments = node['comments'].length;
  }

  const location = map_location(node['location']);

  const user = node['user'] ?? {};
  const owner = node['owner'] ?? {};
  // `{}` is truthy in JavaScript, so emptiness is tested explicitly inside
  // the helper instead of with `user && owner`.
  check_user_owner_match(user, owner);

  // Instagram posts also allow 'Collabs' with up to one co-author
  // NOTE(review): a MissingMappedField pushed here is stringified by join();
  // how it renders depends on that class — confirm.
  const coauthors = [];
  const coauthor_fullnames = [];
  const coauthor_ids = [];
  for (const coauthor_node of node['coauthor_producers'] || []) {
    coauthors.push(coauthor_node['username'] ?? new MissingMappedField(''));
    coauthor_fullnames.push(coauthor_node['full_name'] ?? new MissingMappedField(''));
    coauthor_ids.push(coauthor_node['id']);
  }

  const no_likes = Boolean(node['like_and_view_counts_disabled']);

  // usertags
  // Not always included; MissingMappedField may be more appropriate, but it flags virtually all posts without tags (some do return `None`)
  const usertags = node['usertags'] ? node['usertags']['in'].map((tag) => tag['user']['username']).join(',') : '';

  // Partial items carry no timestamp data.
  const collected_at = partial_item ? new MissingMappedField(0) : formatUtcTimestamp(node['taken_at']);
  const unix_at = partial_item ? new MissingMappedField(0) : node['taken_at'];

  return {
    // Post and caption
    'collected_from_url': normalize_url_encoding(node['__import_meta']?.['source_platform_url']), // Zeeschuimer metadata
    'collected_from_view': node['_zs_instagram_view'] ?? '',
    'partial_item': node['_zs_partial'] ?? '',
    'id': node['code'],
    'timestamp': collected_at,
    'thread_id': node['code'],
    'parent_id': node['code'],
    'url': `https://www.instagram.com/p/${node['code']}`,
    'body': caption,

    // Authors
    'author_id': user['id'] ?? owner['id'] ?? new MissingMappedField(''), // This should always be present
    'author': user['username'] ?? owner['username'] ?? new MissingMappedField(''),
    'author_fullname': user['full_name'] ?? owner['full_name'] ?? new MissingMappedField(''),
    'verified': Boolean(user['is_verified']),
    'author_avatar_url': user['profile_pic_url'] ?? owner['profile_pic_url'] ?? new MissingMappedField(''),
    'coauthors': coauthors.join(','),
    'coauthor_fullnames': coauthor_fullnames.join(','),
    'coauthor_ids': coauthor_ids.join(','),

    // Media
    'media_type': media_type,
    'num_media': num_media,
    'image_urls': display_urls.join(','),
    'media_urls': media_urls.join(','),

    // Engagement
    'hashtags': extract_hashtags(caption),
    'usertags': usertags,
    'play_count': pick_play_count(node),
    'likes_hidden': no_likes ? 'yes' : 'no',
    'num_likes': !no_likes ? node['like_count'] : new MissingMappedField(0),
    'num_comments': num_comments,

    // Location
    'location_name': location['name'],
    'location_id': location['location_id'],
    'location_latlong': location['latlong'],
    'location_city': location['city'],

    // Metadata
    'unix_timestamp': unix_at,
    'missing_media': missing_media, // This denotes media that is unable to be mapped and is otherwise None
  };
}

/**
 * Extract hashtags from a caption as a comma-separated string (without `#`).
 *
 * @param {string|MissingMappedField|null|undefined} caption - Caption text.
 * @returns {string} Comma-joined tags; `''` when the caption is not a string
 *   or contains no hashtags.
 */
function extract_hashtags(caption) {
  if (caption instanceof MissingMappedField || typeof caption !== 'string') {
    return '';
  }
  // `[` and `]` must be escaped inside the character class; unescaped, the
  // first `]` closes the class early and the remainder is matched literally.
  const hashtagRegex = /#([^\s\-!@#$%ˆ&*()_+{}:"|<>?\[\];'`~'‘’]+)/g;
  return [...caption.matchAll(hashtagRegex)].map((match) => match[1]).join(',');
}
// === end auto-generated ===